  1. """
  2. Example of specifying an autoregressive action distribution.
  3. In an action space with multiple components (e.g., Tuple(a1, a2)), you might
  4. want a2 to be sampled based on the sampled value of a1, i.e.,
  5. a2_sampled ~ P(a2 | a1_sampled, obs). Normally, a1 and a2 would be sampled
  6. independently.
  7. To do this, you need both a custom model that implements the autoregressive
  8. pattern, and a custom action distribution class that leverages that model.
  9. This examples shows both.
  10. Related paper: https://arxiv.org/abs/1903.11524
  11. The example uses the CorrelatedActionsEnv where the agent observes a random
  12. number (0 or 1) and has to choose two actions a1 and a2.
  13. Action a1 should match the observation (+5 reward) and a2 should match a1
  14. (+5 reward).
  15. Since a2 should depend on a1, an autoregressive action dist makes sense.
  16. ---
  17. To better understand the environment, run 1 manual train iteration and test
  18. loop without Tune:
  19. $ python autoregressive_action_dist.py --stop-iters 1 --no-tune
  20. Run this example with defaults (using Tune and autoregressive action dist):
  21. $ python autoregressive_action_dist.py
  22. Then run again without autoregressive actions:
  23. $ python autoregressive_action_dist.py --no-autoreg
  24. # TODO: Why does this lead to better results than autoregressive actions?
  25. Compare learning curve on TensorBoard:
  26. $ cd ~/ray-results/; tensorboard --logdir .
  27. Other options for running this example:
  28. $ python attention_net.py --help
  29. """
import argparse
import os

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv
from ray.rllib.examples.models.autoregressive_action_model import \
    AutoregressiveActionModel, TorchAutoregressiveActionModel
from ray.rllib.examples.models.autoregressive_action_dist import \
    BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.logger import pretty_print
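

# ---------------------------------------------------------------------------
# Illustration only (not used by the script below): a minimal, framework-free
# sketch of the autoregressive sampling pattern this example implements.
# Roughly, the custom model produces a1 logits from the observation and a2
# logits from the observation plus the sampled a1, and the custom action
# distribution samples a1 first, then a2 conditioned on it. The function and
# argument names here are made up for the sketch.
def _autoregressive_sampling_sketch(a1_logits_fn, a2_logits_fn, obs):
    """Sample (a1, a2) such that a2 is conditioned on the sampled a1.

    `a1_logits_fn(obs)` and `a2_logits_fn(obs, a1)` stand in for the two
    output heads of the custom model; each returns a list of logits.
    """
    import math
    import random

    def categorical_sample(logits):
        # Softmax the logits and draw a single categorical sample.
        m = max(logits)
        exps = [math.exp(l - m) for l in logits]
        total = sum(exps)
        probs = [e / total for e in exps]
        return random.choices(range(len(probs)), weights=probs, k=1)[0]

    a1 = categorical_sample(a1_logits_fn(obs))      # a1 ~ P(a1 | obs)
    a2 = categorical_sample(a2_logits_fn(obs, a1))  # a2 ~ P(a2 | a1, obs)
    return a1, a2
# ---------------------------------------------------------------------------

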
def get_cli_args():
    """Create CLI parser and return parsed arguments."""
    parser = argparse.ArgumentParser()

    # example-specific arg: disable autoregressive action dist
    parser.add_argument(
        "--no-autoreg",
        action="store_true",
        help="Do NOT use an autoregressive action distribution but normal, "
        "independently distributed actions.")

    # general args
    parser.add_argument(
        "--run",
        type=str,
        default="PPO",
        help="The RLlib-registered algorithm to use.")
    parser.add_argument(
        "--framework",
        choices=["tf", "tf2", "tfe", "torch"],
        default="tf",
        help="The DL framework specifier.")
    parser.add_argument("--num-cpus", type=int, default=0)
    parser.add_argument(
        "--as-test",
        action="store_true",
        help="Whether this script should be run as a test: --stop-reward must "
        "be achieved within --stop-timesteps AND --stop-iters.")
    parser.add_argument(
        "--stop-iters",
        type=int,
        default=200,
        help="Number of iterations to train.")
    parser.add_argument(
        "--stop-timesteps",
        type=int,
        default=100000,
        help="Number of timesteps to train.")
    parser.add_argument(
        "--stop-reward",
        type=float,
        default=200.0,
        help="Reward at which we stop training.")
    parser.add_argument(
        "--no-tune",
        action="store_true",
        help="Run without Tune using a manual train loop instead. Here, "
        "there is no TensorBoard support.")
    parser.add_argument(
        "--local-mode",
        action="store_true",
        help="Init Ray in local mode for easier debugging.")

    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")
    return args

if __name__ == "__main__":
    args = get_cli_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # main part: register and configure autoregressive action model and dist
    # here, tailored to the CorrelatedActionsEnv such that a2 depends on a1
    ModelCatalog.register_custom_model(
        "autoregressive_model", TorchAutoregressiveActionModel
        if args.framework == "torch" else AutoregressiveActionModel)
    ModelCatalog.register_custom_action_dist(
        "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution
        if args.framework == "torch" else BinaryAutoregressiveDistribution)
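    # The strings registered above ("autoregressive_model",
    # "binary_autoreg_dist") are referenced again by the "custom_model" and
    # "custom_action_dist" keys of the model config further below.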
    # standard config
    config = {
        "env": CorrelatedActionsEnv,
        "gamma": 0.5,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    }
    # use registered model and dist in config
    if not args.no_autoreg:
        config["model"] = {
            "custom_model": "autoregressive_model",
            "custom_action_dist": "binary_autoreg_dist",
        }
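    # With --no-autoreg, the keys above are left out and RLlib falls back to
    # its default model, sampling a1 and a2 independently (see docstring);
    # this serves as the baseline to compare the autoregressive dist against.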
    # use stop conditions passed via CLI (or defaults)
    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
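    # Note: Tune stops a trial as soon as any one of these criteria is met.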
    # manual training loop using PPO without tune.run()
    if args.no_tune:
        if args.run != "PPO":
            raise ValueError("Only --run PPO is supported with --no-tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        trainer = ppo.PPOTrainer(config=ppo_config, env=CorrelatedActionsEnv)

        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = trainer.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if result["timesteps_total"] >= args.stop_timesteps or \
                    result["episode_reward_mean"] >= args.stop_reward:
                break
        # run manual test loop: one episode until done
        print("Finished training. Running manual test/inference loop.")
        env = CorrelatedActionsEnv({})
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            a1, a2 = trainer.compute_single_action(obs)
            next_obs, reward, done, _ = env.step((a1, a2))
            print(f"Obs: {obs}, Action: a1={a1} a2={a2}, Reward: {reward}")
            obs = next_obs
            total_reward += reward
        print(f"Total reward in test episode: {total_reward}")
    # run with Tune for auto env and trainer creation and TensorBoard
    else:
        results = tune.run(args.run, stop=stop, config=config, verbose=2)

        if args.as_test:
            print("Checking if learning goals were achieved")
            check_learning_achieved(results, args.stop_reward)

    ray.shutdown()