  1. """Example of customizing evaluation with RLlib.
  2. Pass --custom-eval to run with a custom evaluation function too.
  3. Here we define a custom evaluation method that runs a specific sweep of env
  4. parameters (SimpleCorridor corridor lengths).
  5. ------------------------------------------------------------------------
  6. Sample output for `python custom_eval.py`
  7. ------------------------------------------------------------------------
  8. INFO trainer.py:623 -- Evaluating current policy for 10 episodes.
  9. INFO trainer.py:650 -- Running round 0 of parallel evaluation (2/10 episodes)
  10. INFO trainer.py:650 -- Running round 1 of parallel evaluation (4/10 episodes)
  11. INFO trainer.py:650 -- Running round 2 of parallel evaluation (6/10 episodes)
  12. INFO trainer.py:650 -- Running round 3 of parallel evaluation (8/10 episodes)
  13. INFO trainer.py:650 -- Running round 4 of parallel evaluation (10/10 episodes)
  14. Result for PG_SimpleCorridor_2c6b27dc:
  15. ...
  16. evaluation:
  17. custom_metrics: {}
  18. episode_len_mean: 15.864661654135338
  19. episode_reward_max: 1.0
  20. episode_reward_mean: 0.49624060150375937
  21. episode_reward_min: 0.0
  22. episodes_this_iter: 133
  23. off_policy_estimator: {}
  24. policy_reward_max: {}
  25. policy_reward_mean: {}
  26. policy_reward_min: {}
  27. sampler_perf:
  28. mean_env_wait_ms: 0.0362923321333299
  29. mean_inference_ms: 0.6319202064080927
  30. mean_processing_ms: 0.14143652169068222
  31. ------------------------------------------------------------------------
  32. Sample output for `python custom_eval.py --custom-eval`
  33. ------------------------------------------------------------------------
  34. INFO trainer.py:631 -- Running custom eval function <function ...>
  35. Update corridor length to 4
  36. Update corridor length to 7
  37. Custom evaluation round 1
  38. Custom evaluation round 2
  39. Custom evaluation round 3
  40. Custom evaluation round 4
  41. Result for PG_SimpleCorridor_0de4e686:
  42. ...
  43. evaluation:
  44. custom_metrics: {}
  45. episode_len_mean: 9.15695067264574
  46. episode_reward_max: 1.0
  47. episode_reward_mean: 0.9596412556053812
  48. episode_reward_min: 0.0
  49. episodes_this_iter: 223
  50. foo: 1
  51. off_policy_estimator: {}
  52. policy_reward_max: {}
  53. policy_reward_mean: {}
  54. policy_reward_min: {}
  55. sampler_perf:
  56. mean_env_wait_ms: 0.03423667269562796
  57. mean_inference_ms: 0.5654563161491506
  58. mean_processing_ms: 0.14494765630060774
  59. """

import argparse
import os

import ray
from ray import tune
from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes
from ray.rllib.examples.env.simple_corridor import SimpleCorridor
from ray.rllib.utils.test_utils import check_learning_achieved

parser = argparse.ArgumentParser()
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.")
parser.add_argument(
    "--no-custom-eval",
    action="store_true",
    help="Run with the Trainer's built-in evaluation instead of the custom "
    "evaluation function.")
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps AND --stop-iters.")
parser.add_argument(
    "--stop-iters",
    type=int,
    default=50,
    help="Number of iterations to train.")
parser.add_argument(
    "--stop-timesteps",
    type=int,
    default=20000,
    help="Number of timesteps to train.")
parser.add_argument(
    "--stop-reward",
    type=float,
    default=0.7,
    help="Reward at which we stop training.")


def custom_eval_function(trainer, eval_workers):
    """Example of a custom evaluation function.

    Args:
        trainer (Trainer): Trainer class to evaluate.
        eval_workers (WorkerSet): Evaluation workers.

    Returns:
        metrics (dict): Evaluation metrics dict.
    """
    # We configured 2 eval workers in the training config.
    worker_1, worker_2 = eval_workers.remote_workers()

    # Set different env settings for each worker. Here we use a fixed config,
    # which also could have been computed in each worker by looking at
    # env_config.worker_index (see the SimpleCorridor class).
    worker_1.foreach_env.remote(lambda env: env.set_corridor_length(4))
    worker_2.foreach_env.remote(lambda env: env.set_corridor_length(7))
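    # Note: `foreach_env.remote()` returns ObjectRefs immediately and runs
    # asynchronously on each eval worker. Since Ray executes an actor's tasks
    # in submission order, the new corridor lengths are applied before the
    # `sample()` calls below; wrap the refs in `ray.get()` to block explicitly.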

    for i in range(5):
        print("Custom evaluation round", i)
        # Calling .sample() runs exactly one episode per worker due to how the
        # eval workers are configured.
        ray.get([w.sample.remote() for w in eval_workers.remote_workers()])
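        # `ray.get()` blocks until every eval worker has finished its rollout
        # for this round, so all episodes are buffered on the workers before
        # being collected below.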

    # Collect the accumulated episodes on the workers, and then summarize the
    # episode stats into a metrics dict.
    episodes, _ = collect_episodes(
        remote_workers=eval_workers.remote_workers(), timeout_seconds=99999)
    # You can compute metrics from the episodes manually, or use the
    # convenient `summarize_episodes()` utility:
    metrics = summarize_episodes(episodes)
    # Note that the above two statements are the equivalent of:
    # metrics = collect_metrics(eval_workers.local_worker(),
    #                           eval_workers.remote_workers())

    # You can also put custom values in the metrics dict.
    metrics["foo"] = 1
    return metrics


if __name__ == "__main__":
    args = parser.parse_args()

    if args.no_custom_eval:
        eval_fn = None
    else:
        eval_fn = custom_eval_function

    ray.init(num_cpus=args.num_cpus or None)

    config = {
        "env": SimpleCorridor,
        "env_config": {
            "corridor_length": 10,
        },
        "horizon": 20,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),

        # Training rollouts will be collected using just the learner
        # process, but evaluation will be done in parallel with two
        # workers. Hence, this run will use 3 CPUs total (1 for the
        # learner + 2 more for evaluation workers).
        "num_workers": 0,
        "evaluation_num_workers": 2,

        # Optional custom eval function.
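        # It is called as `eval_fn(trainer, eval_workers)` and must return a
        # metrics dict; if None (--no-custom-eval), RLlib runs its built-in
        # parallel evaluation instead.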
  150. "custom_eval_function": eval_fn,
  151. # Enable evaluation, once per training iteration.
  152. "evaluation_interval": 1,
  153. # Run 10 episodes each time evaluation runs.
  154. "evaluation_duration": 10,

        # Override the env config for evaluation.
        "evaluation_config": {
            "env_config": {
                # Evaluate using a SHORTER corridor than trained on (the
                # custom eval function overrides this again with lengths
                # 4 and 7).
                "corridor_length": 5,
            },
        },
        "framework": args.framework,
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    results = tune.run("PG", config=config, stop=stop, verbose=1)

    # Check eval results (from eval workers using the custom function),
    # not results from the regular workers.
    if args.as_test:
        check_learning_achieved(results, args.stop_reward, evaluation=True)

    ray.shutdown()