""" Example of specifying an autoregressive action distribution. In an action space with multiple components (e.g., Tuple(a1, a2)), you might want a2 to be sampled based on the sampled value of a1, i.e., a2_sampled ~ P(a2 | a1_sampled, obs). Normally, a1 and a2 would be sampled independently. To do this, you need both a custom model that implements the autoregressive pattern, and a custom action distribution class that leverages that model. This examples shows both. Related paper: https://arxiv.org/abs/1903.11524 The example uses the CorrelatedActionsEnv where the agent observes a random number (0 or 1) and has to choose two actions a1 and a2. Action a1 should match the observation (+5 reward) and a2 should match a1 (+5 reward). Since a2 should depend on a1, an autoregressive action dist makes sense. --- To better understand the environment, run 1 manual train iteration and test loop without Tune: $ python autoregressive_action_dist.py --stop-iters 1 --no-tune Run this example with defaults (using Tune and autoregressive action dist): $ python autoregressive_action_dist.py Then run again without autoregressive actions: $ python autoregressive_action_dist.py --no-autoreg # TODO: Why does this lead to better results than autoregressive actions? Compare learning curve on TensorBoard: $ cd ~/ray-results/; tensorboard --logdir . Other options for running this example: $ python attention_net.py --help """ import argparse import os import ray from ray import tune from ray.rllib.agents import ppo from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv from ray.rllib.examples.models.autoregressive_action_model import \ AutoregressiveActionModel, TorchAutoregressiveActionModel from ray.rllib.examples.models.autoregressive_action_dist import \ BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution from ray.rllib.models import ModelCatalog from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.logger import pretty_print def get_cli_args(): """Create CLI parser and return parsed arguments""" parser = argparse.ArgumentParser() # example-specific arg: disable autoregressive action dist parser.add_argument( "--no-autoreg", action="store_true", help="Do NOT use an autoregressive action distribution but normal," "independently distributed actions.") # general args parser.add_argument( "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use.") parser.add_argument( "--framework", choices=["tf", "tf2", "tfe", "torch"], default="tf", help="The DL framework specifier.") parser.add_argument("--num-cpus", type=int, default=0) parser.add_argument( "--as-test", action="store_true", help="Whether this script should be run as a test: --stop-reward must " "be achieved within --stop-timesteps AND --stop-iters.") parser.add_argument( "--stop-iters", type=int, default=200, help="Number of iterations to train.") parser.add_argument( "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train.") parser.add_argument( "--stop-reward", type=float, default=200.0, help="Reward at which we stop training.") parser.add_argument( "--no-tune", action="store_true", help="Run without Tune using a manual train loop instead. Here," "there is no TensorBoard support.") parser.add_argument( "--local-mode", action="store_true", help="Init Ray in local mode for easier debugging.") args = parser.parse_args() print(f"Running with following CLI args: {args}") return args if __name__ == "__main__": args = get_cli_args() ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) # main part: register and configure autoregressive action model and dist # here, tailored to the CorrelatedActionsEnv such that a2 depends on a1 ModelCatalog.register_custom_model( "autoregressive_model", TorchAutoregressiveActionModel if args.framework == "torch" else AutoregressiveActionModel) ModelCatalog.register_custom_action_dist( "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution if args.framework == "torch" else BinaryAutoregressiveDistribution) # standard config config = { "env": CorrelatedActionsEnv, "gamma": 0.5, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), "framework": args.framework, } # use registered model and dist in config if not args.no_autoreg: config["model"] = { "custom_model": "autoregressive_model", "custom_action_dist": "binary_autoreg_dist", } # use stop conditions passed via CLI (or defaults) stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } # manual training loop using PPO without tune.run() if args.no_tune: if args.run != "PPO": raise ValueError("Only support --run PPO with --no-tune.") ppo_config = ppo.DEFAULT_CONFIG.copy() ppo_config.update(config) trainer = ppo.PPOTrainer(config=ppo_config, env=CorrelatedActionsEnv) # run manual training loop and print results after each iteration for _ in range(args.stop_iters): result = trainer.train() print(pretty_print(result)) # stop training if the target train steps or reward are reached if result["timesteps_total"] >= args.stop_timesteps or \ result["episode_reward_mean"] >= args.stop_reward: break # run manual test loop: 1 iteration until done print("Finished training. Running manual test/inference loop.") env = CorrelatedActionsEnv(_) obs = env.reset() done = False total_reward = 0 while not done: a1, a2 = trainer.compute_single_action(obs) next_obs, reward, done, _ = env.step((a1, a2)) print(f"Obs: {obs}, Action: a1={a1} a2={a2}, Reward: {reward}") obs = next_obs total_reward += reward print(f"Total reward in test episode: {total_reward}") # run with Tune for auto env and trainer creation and TensorBoard else: results = tune.run(args.run, stop=stop, config=config, verbose=2) if args.as_test: print("Checking if learning goals were achieved") check_learning_achieved(results, args.stop_reward) ray.shutdown()