# cartpole-crashing-pg.yaml
  1. cartpole-crashing-pg:
  2. env: ray.rllib.examples.env.cartpole_crashing.CartPoleCrashing
  3. run: PG
  4. stop:
  5. evaluation/sampler_results/episode_reward_mean: 150.0
  6. num_env_steps_sampled: 150000
  7. config:
  8. # Works for both torch and tf.
  9. framework: torch
  10. env_config:
  11. config:
  12. # Crash roughly every 300 ts. This should be ok to measure 180.0
  13. # reward (episodes are 200 ts long).
  14. p_crash: 0.0025 # prob to crash during step()
  15. p_crash_reset: 0.01 # prob to crash during reset()
  16. # Time for the env to initialize when newly created.
  17. # Every time a remote sub-environment crashes, a new env is created
  18. # in its place and will take this long (sleep) to "initialize".
  19. init_time_s: 1.0
  20. num_workers: 2
  21. num_envs_per_worker: 5
  22. # Disable env checking. Env checker doesn't handle Exceptions from
  23. # user envs, and will crash rollout worker.
  24. disable_env_checking: true
  25. # Switch on resiliency for failed sub environments (within a vectorized stack).
  26. restart_failed_sub_environments: true
  27. evaluation_num_workers: 2
  28. evaluation_interval: 1
  29. evaluation_duration: 20
  30. evaluation_duration_unit: episodes
  31. evaluation_parallel_to_training: true
  32. evaluation_config:
  33. explore: false
  34. env_config:
  35. config:
  36. # Make eval workers solid.
  37. # This test is to prove that we can learn with crashing env,
  38. # not eval with crashing env.
  39. p_crash: 0.0
  40. p_crash_reset: 0.0
  41. init_time_s: 0.0