# Source repository: openoker/ray
# This configuration can be expected to reach a reward of about -160 within 10k-20k timesteps.
pendulum-td3:
  # Experiment: run the TD3 algorithm on the Pendulum-v1 environment.
  env: Pendulum-v1
  run: TD3

  # Stopping criteria: a mean-episode-reward threshold and a total-timestep
  # budget.
  stop:
    episode_reward_mean: -900
    timesteps_total: 100000

  config:
    # tf is selected here; the setup works with torch as well.
    framework: tf

    # --- Model ---
    # Actor and critic use the same hidden-layer specification.
    actor_hiddens: [64, 64]
    critic_hiddens: [64, 64]

    # --- Exploration ---
    # learning_starts and random_timesteps are set to the same value (5000).
    learning_starts: 5000
    exploration_config:
      random_timesteps: 5000

    # --- Evaluation ---
    evaluation_interval: 1
    evaluation_num_episodes: 5