openoker
/
ray


			
				
					
						
						
# Given an offline file produced by a SAC run, generated via:
# rllib train -f tuned_examples/sac/pendulum-sac.yaml --no-ray-ui

# CQL on Pendulum, trained from that offline file, can reach roughly -300
# reward in 10k (presumably timesteps -- TODO confirm). The stop criteria
# below use an evaluation reward threshold of -700 and a cap of 200000
# total timesteps.
pendulum-cql:
  env: Pendulum-v1
  run: CQL
  stop:
    evaluation/episode_reward_mean: -700
    timesteps_total: 200000
  config:
    # Either torch or tf works for this example.
    framework: tf

    # Offline data source: list one or more offline files here, or set
    # "input: sampler" to learn online instead.
    input:
      - "tests/data/pendulum/enormous.zip"
    # The file above was produced by an SAC run, so the actions stored in it
    # are already normalized (they were produced by SquashedGaussian).
    actions_in_input_normalized: true
    clip_actions: true

    twin_q: true
    train_batch_size: 2000
    learning_starts: 0
    bc_iters: 100

    metrics_smoothing_episodes: 5

    # Run evaluation against an actual environment.
    evaluation_interval: 1
    evaluation_num_workers: 2
    evaluation_num_episodes: 10
    evaluation_parallel_to_training: true
    evaluation_config:
      # Evaluation-only overrides: use "input: sampler" rather than the
      # offline file, with exploration switched off.
      input: sampler
      explore: false