- """
- Example of a curriculum learning setup using the `TaskSettableEnv` API
- and the env_task_fn config.
- This example shows:
- - Writing your own curriculum-capable environment using gym.Env.
- - Defining a env_task_fn that determines, whether and which new task
- the env(s) should be set to (using the TaskSettableEnv API).
- - Using Tune and RLlib to curriculum-learn this env.
- You can visualize experiment results in ~/ray_results using TensorBoard.
- """
import argparse
import numpy as np
import os

import ray
from ray import tune
from ray.rllib.env.apis.task_settable_env import TaskSettableEnv, TaskType
from ray.rllib.env.env_context import EnvContext
from ray.rllib.examples.env.curriculum_capable_env import CurriculumCapableEnv
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved

tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()
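# (Neither tf nor torch is used directly below; importing them here presumably
# just verifies up front that the selected framework is installed.)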
parser = argparse.ArgumentParser()
parser.add_argument(
    "--run",
    type=str,
    default="PPO",
    help="The RLlib-registered algorithm to use.")
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.")
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps AND --stop-iters.")
parser.add_argument(
    "--stop-iters",
    type=int,
    default=50,
    help="Number of iterations to train.")
parser.add_argument(
    "--stop-timesteps",
    type=int,
    default=200000,
    help="Number of timesteps to train.")
parser.add_argument(
    "--stop-reward",
    type=float,
    default=10000.0,
    help="Reward at which we stop training.")

def curriculum_fn(train_results: dict, task_settable_env: TaskSettableEnv,
                  env_ctx: EnvContext) -> TaskType:
    """Function returning a possibly new task to set `task_settable_env` to.

    Args:
        train_results (dict): The train results returned by Trainer.train().
        task_settable_env (TaskSettableEnv): A single TaskSettableEnv object
            used inside any worker and at any vector position. Use `env_ctx`
            to get the worker_index, vector_index, and num_workers.
        env_ctx (EnvContext): The env context object (i.e. the env's config
            dict plus the properties worker_index, vector_index and
            num_workers) used to set up the `task_settable_env`.

    Returns:
        TaskType: The task to set the env to. This may be the same as the
            current one.
    """
    # Our env supports tasks 1 (default) through 5.
    # With each task, rewards get scaled up by a factor of 10, such that:
    # Level 1: Expect rewards between 0.0 and 1.0.
    # Level 2: Expect rewards between 1.0 and 10.0, etc.
    # We will thus raise the level/task each time we hit a new power of 10.
    new_task = int(np.log10(train_results["episode_reward_mean"]) + 2.1)
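    # How the formula above maps mean rewards to tasks: the +2 shifts the
    # log10 bucket onto task IDs starting at 1, and the extra 0.1 advances
    # the task slightly before the next power of 10 is fully reached, e.g.:
    #     R=0.5   -> int(-0.30 + 2.1) = 1
    #     R=5.0   -> int( 0.70 + 2.1) = 2
    #     R=500.0 -> int( 2.70 + 2.1) = 4
    # (Assumes `episode_reward_mean` is positive; np.log10 of a non-positive
    # value would yield nan/-inf here and make the int() conversion fail.)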
    # Clamp between valid values, just in case:
    new_task = max(min(new_task, 5), 1)
    print(f"Worker #{env_ctx.worker_index} vec-idx={env_ctx.vector_index}"
          f"\nR={train_results['episode_reward_mean']}"
          f"\nSetting env to task={new_task}")
    return new_task

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Can also register the env creator function explicitly with:
    # from ray.tune.registry import register_env
    # register_env(
    #     "curriculum_env", lambda config: CurriculumCapableEnv(config))
    config = {
        "env": CurriculumCapableEnv,  # or "curriculum_env" if registered above
        "env_config": {
            "start_level": 1,
        },
        "num_workers": 2,  # parallelism
        "num_envs_per_worker": 5,
        "env_task_fn": curriculum_fn,
        # Use GPUs iff the `RLLIB_NUM_GPUS` env var is set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    }
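    # Note: RLlib calls the `env_task_fn` with the latest train results after
    # each training iteration, once per sub-env on every worker, and (at least
    # in the RLlib version this example targets) only invokes `set_task()`
    # when the returned task differs from the env's current `get_task()`.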
    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    results = tune.run(args.run, config=config, stop=stop, verbose=2)

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)
    ray.shutdown()