- """
- Example of specifying an autoregressive action distribution.
- In an action space with multiple components (e.g., Tuple(a1, a2)), you might
- want a2 to be sampled based on the sampled value of a1, i.e.,
- a2_sampled ~ P(a2 | a1_sampled, obs). Normally, a1 and a2 would be sampled
- independently.
- To do this, you need both a custom model that implements the autoregressive
- pattern, and a custom action distribution class that leverages that model.
- This examples shows both.
- Related paper: https://arxiv.org/abs/1903.11524
- The example uses the CorrelatedActionsEnv where the agent observes a random
- number (0 or 1) and has to choose two actions a1 and a2.
- Action a1 should match the observation (+5 reward) and a2 should match a1
- (+5 reward).
- Since a2 should depend on a1, an autoregressive action dist makes sense.
- ---
- To better understand the environment, run 1 manual train iteration and test
- loop without Tune:
- $ python autoregressive_action_dist.py --stop-iters 1 --no-tune
- Run this example with defaults (using Tune and autoregressive action dist):
- $ python autoregressive_action_dist.py
- Then run again without autoregressive actions:
- $ python autoregressive_action_dist.py --no-autoreg
- # TODO: Why does this lead to better results than autoregressive actions?
- Compare learning curve on TensorBoard:
- $ cd ~/ray-results/; tensorboard --logdir .
- Other options for running this example:
- $ python attention_net.py --help
- """
import argparse
import os

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv
from ray.rllib.examples.models.autoregressive_action_model import \
    AutoregressiveActionModel, TorchAutoregressiveActionModel
from ray.rllib.examples.models.autoregressive_action_dist import \
    BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.logger import pretty_print
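
# -----------------------------------------------------------------------------
# Illustration only (not part of RLlib's API): a minimal, framework-free
# sketch of the autoregressive sampling scheme described in the docstring,
# i.e. the factorization P(a1, a2 | obs) = P(a1 | obs) * P(a2 | a1, obs).
# The helper name and the hard-coded probabilities are made up for this
# sketch; the actual logits come from the custom model and action
# distribution registered further down.
# -----------------------------------------------------------------------------
def _toy_autoregressive_sample(obs):
    """Sample a1 from obs, then a2 conditioned on the *sampled* a1."""
    import random

    # Stand-in "policy" for a1: put most of the probability mass on matching
    # the observation (what the env rewards).
    a1 = obs if random.random() < 0.9 else 1 - obs
    # Stand-in "policy" for a2: condition on the sampled a1, not only on obs.
    a2 = a1 if random.random() < 0.9 else 1 - a1
    return a1, a2
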
def get_cli_args():
    """Create CLI parser and return parsed arguments"""
    parser = argparse.ArgumentParser()

    # example-specific arg: disable autoregressive action dist
    parser.add_argument(
        "--no-autoreg",
        action="store_true",
        help="Do NOT use an autoregressive action distribution but normal, "
        "independently distributed actions.")

    # general args
    parser.add_argument(
        "--run",
        type=str,
        default="PPO",
        help="The RLlib-registered algorithm to use.")
    parser.add_argument(
        "--framework",
        choices=["tf", "tf2", "tfe", "torch"],
        default="tf",
        help="The DL framework specifier.")
    parser.add_argument("--num-cpus", type=int, default=0)
    parser.add_argument(
        "--as-test",
        action="store_true",
        help="Whether this script should be run as a test: --stop-reward must "
        "be achieved within --stop-timesteps AND --stop-iters.")
    parser.add_argument(
        "--stop-iters",
        type=int,
        default=200,
        help="Number of iterations to train.")
    parser.add_argument(
        "--stop-timesteps",
        type=int,
        default=100000,
        help="Number of timesteps to train.")
    parser.add_argument(
        "--stop-reward",
        type=float,
        default=200.0,
        help="Reward at which we stop training.")
    parser.add_argument(
        "--no-tune",
        action="store_true",
        help="Run without Tune using a manual train loop instead. Here, "
        "there is no TensorBoard support.")
    parser.add_argument(
        "--local-mode",
        action="store_true",
        help="Init Ray in local mode for easier debugging.")

    args = parser.parse_args()
    print(f"Running with the following CLI args: {args}")
    return args

if __name__ == "__main__":
    args = get_cli_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # main part: register and configure autoregressive action model and dist
    # here, tailored to the CorrelatedActionsEnv such that a2 depends on a1
    ModelCatalog.register_custom_model(
        "autoregressive_model", TorchAutoregressiveActionModel
        if args.framework == "torch" else AutoregressiveActionModel)
    ModelCatalog.register_custom_action_dist(
        "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution
        if args.framework == "torch" else BinaryAutoregressiveDistribution)
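    # The names registered above ("autoregressive_model",
    # "binary_autoreg_dist") are what the `custom_model` and
    # `custom_action_dist` keys in the model config below refer to.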
    # standard config
    config = {
        "env": CorrelatedActionsEnv,
        "gamma": 0.5,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    }
    # use registered model and dist in config
    if not args.no_autoreg:
        config["model"] = {
            "custom_model": "autoregressive_model",
            "custom_action_dist": "binary_autoreg_dist",
        }
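    # Without the custom model/dist (i.e. with --no-autoreg), RLlib falls
    # back to its defaults for the Tuple action space and samples a1 and a2
    # independently.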
    # use stop conditions passed via CLI (or defaults)
    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
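    # Note: Tune stops a trial as soon as ANY one of these criteria is met,
    # similar to the `or` check in the manual loop below.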
    # manual training loop using PPO without tune.run()
    if args.no_tune:
        if args.run != "PPO":
            raise ValueError("Only --run PPO is supported with --no-tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        trainer = ppo.PPOTrainer(config=ppo_config, env=CorrelatedActionsEnv)

        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = trainer.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if result["timesteps_total"] >= args.stop_timesteps or \
                    result["episode_reward_mean"] >= args.stop_reward:
                break
        # run manual test loop: one episode until done
        print("Finished training. Running manual test/inference loop.")
        # the env's config argument is not used here, so pass None
        env = CorrelatedActionsEnv(None)
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            a1, a2 = trainer.compute_single_action(obs)
            next_obs, reward, done, _ = env.step((a1, a2))
            print(f"Obs: {obs}, Action: a1={a1} a2={a2}, Reward: {reward}")
            obs = next_obs
            total_reward += reward
        print(f"Total reward in test episode: {total_reward}")
    # run with Tune for auto env and trainer creation and TensorBoard
    else:
        results = tune.run(args.run, stop=stop, config=config, verbose=2)

        if args.as_test:
            print("Checking if learning goals were achieved")
            check_learning_achieved(results, args.stop_reward)

    ray.shutdown()