#!/usr/bin/env python
# Runs one or more regression tests. Retries tests up to 3 times.
#
# Example usage:
# $ python run_regression_tests.py --dir=regression-tests/cartpole-es-[tf|torch].yaml
#
# When using in BAZEL (with py_test), e.g. see in ray/rllib/BUILD:
# py_test(
#     name = "run_regression_tests",
#     main = "tests/run_regression_tests.py",
#     tags = ["learning_tests"],
#     size = "medium",  # 5min timeout
#     srcs = ["tests/run_regression_tests.py"],
#     data = glob(["tuned_examples/regression_tests/*.yaml"]),
#     # Pass `BAZEL` option and the path to look for yaml regression files.
#     args = ["BAZEL", "tuned_examples/regression_tests"]
# )
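#
# Each yaml file is expected to define exactly one experiment (the script asserts
# this below). A minimal sketch of such a file, with hypothetical values shown
# purely for illustration:
#
# cartpole-ppo-regression:
#     run: PPO
#     env: CartPole-v1
#     stop:
#         env_runners/episode_return_mean: 150.0
#         training_iteration: 100
#     config:
#         framework: torch
#         lr: 0.0003
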
import argparse
import os
from pathlib import Path
import sys
import re

import yaml

import ray
from ray import air
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.rllib import _register_all
from ray.rllib.common import SupportedFileType
from ray.rllib.train import load_experiments_from_file
from ray.rllib.utils.deprecation import deprecation_warning
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    EVALUATION_RESULTS,
)
from ray.tune import run_experiments

parser = argparse.ArgumentParser()
parser.add_argument(
    "--framework",
    type=str,
    choices=["torch", "tf2", "tf"],
    default=None,
    help="The deep learning framework to use. If not provided, try using the one "
    "specified in the file, otherwise, use RLlib's default: `torch`.",
)
parser.add_argument(
    "--dir",
    type=str,
    required=True,
    help="The directory or file in which to find all tests.",
)
parser.add_argument(
    "--env",
    type=str,
    default=None,
    help="An optional env override setting. If not provided, try using the one "
    "specified in the file.",
)
parser.add_argument("--num-cpus", type=int, default=None)
parser.add_argument(
    "--local-mode",
    action="store_true",
    help="Run ray in local mode for easier debugging.",
)
parser.add_argument(
    "--num-samples",
    type=int,
    default=1,
    help="The number of seeds/samples to run with the given experiment config.",
)
parser.add_argument(
    "--override-mean-reward",
    type=float,
    default=0.0,
    help=(
        "Override the mean reward specified by the yaml file in the stopping criteria. "
        "This is particularly useful for timed tests."
    ),
)
parser.add_argument(
    "--verbose",
    type=int,
    default=2,
    help="The verbosity level for the main `tune.run_experiments()` call.",
)
parser.add_argument(
    "--wandb-key",
    type=str,
    default=None,
    help="The WandB API key to use for uploading results.",
)
parser.add_argument(
    "--wandb-project",
    type=str,
    default=None,
    help="The WandB project name to use.",
)
parser.add_argument(
    "--wandb-run-name",
    type=str,
    default=None,
    help="The WandB run name to use.",
)
# parser.add_argument(
#     "--wandb-from-checkpoint",
#     type=str,
#     default=None,
#     help=(
#         "The WandB checkpoint location (e.g. `[team name]/[project name]/checkpoint_"
#         "[run name]:v[version]`) from which to resume an experiment."
#     ),
# )
parser.add_argument(
    "--checkpoint-freq",
    type=int,
    default=0,
    help=(
        "The frequency (in training iterations) with which to create checkpoints. "
        "Note that if --wandb-key is provided, these checkpoints will automatically "
        "be uploaded to WandB."
    ),
)

# Obsoleted arg, use --dir instead.
parser.add_argument("--yaml-dir", type=str, default="")

if __name__ == "__main__":
    args = parser.parse_args()

    if args.yaml_dir != "":
        deprecation_warning(old="--yaml-dir", new="--dir", error=True)

    # Bazel regression test mode: Get path to look for yaml files.
    # Get the path or single file to use.
    rllib_dir = Path(__file__).parent.parent
    print(f"rllib dir={rllib_dir}")
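    # Note: A relative --dir is resolved against the rllib directory above, not
    # against the current working directory.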
    abs_path = os.path.join(rllib_dir, args.dir)
    # Single file given.
    if os.path.isfile(abs_path):
        files = [abs_path]
    # Path given -> Get all yaml files in there via rglob.
    elif os.path.isdir(abs_path):
        files = []
        for type_ in ["yaml", "yml", "py"]:
            files += list(rllib_dir.rglob(args.dir + f"/*.{type_}"))
        files = sorted(map(lambda path: str(path.absolute()), files), reverse=True)
    # Given path/file does not exist.
    else:
        raise ValueError(f"--dir ({args.dir}) not found!")

    print("Will run the following regression tests:")
    for file in files:
        print("->", file)

    # Loop through all collected files.
    for file in files:
        config_is_python = False
        # For python files, we need to make sure that we only deliver the module name
        # into the `load_experiments_from_file` function (everything from "/ray/rllib"
        # on).
        if file.endswith(".py"):
            if file.endswith("__init__.py"):  # weird CI learning test (BAZEL) case
                continue
            experiments = load_experiments_from_file(file, SupportedFileType.python)
            config_is_python = True
        else:
            experiments = load_experiments_from_file(file, SupportedFileType.yaml)

        assert (
            len(experiments) == 1
        ), "Error, can only run a single experiment per file!"

        exp = list(experiments.values())[0]
        exp_name = list(experiments.keys())[0]

        # Set the number of samples to run.
        exp["num_samples"] = args.num_samples

        # Make sure there is a config and a stopping criterion.
        exp["config"] = exp.get("config", {})
        exp["stop"] = exp.get("stop", {})

        # Override framework setting with the command line one, if provided.
        # Otherwise, will use framework setting in file (or default: torch).
        if args.framework is not None:
            exp["config"]["framework"] = args.framework

        # Override env setting if given on command line.
        if args.env is not None:
            exp["config"]["env"] = args.env
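        # Otherwise, copy the experiment's top-level `env` setting into the config.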
        else:
            exp["config"]["env"] = exp["env"]

        # Override the mean reward, if specified. This is used by Ray CI to override
        # the episode reward mean for long-running, off-policy tf2 learning tests,
        # such as SAC and DDPG on the Pendulum environment.
        if args.override_mean_reward != 0.0:
            exp["stop"][
                f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
            ] = args.override_mean_reward

        # Checkpoint settings.
        exp["checkpoint_config"] = air.CheckpointConfig(
            checkpoint_frequency=args.checkpoint_freq,
            checkpoint_at_end=args.checkpoint_freq > 0,
        )

        # Always run with eager-tracing when framework=tf2, if not in local-mode
        # and unless the yaml explicitly tells us to disable eager tracing.
        if (
            (args.framework == "tf2" or exp["config"].get("framework") == "tf2")
            and not args.local_mode
            # Note: This check will always fail for python configs, b/c normally,
            # algorithm configs have `self.eager_tracing=False` by default.
            # Thus, you'd have to set `eager_tracing` to True explicitly in your python
            # config to make sure we are indeed using eager tracing.
            and exp["config"].get("eager_tracing") is not False
        ):
            exp["config"]["eager_tracing"] = True

        # Print out the actual config (not for py files as yaml.dump weirdly fails).
        if not config_is_python:
            print("== Test config ==")
            print(yaml.dump(experiments))

        callbacks = None
        if args.wandb_key is not None:
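            # If no --wandb-project is given, derive one: for python configs, use
            # "<run>-<env>" (lower-cased, non-word characters replaced by "-"); for
            # yaml configs, fall back to the experiment's top-level key.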
            project = args.wandb_project or (
                exp["run"].lower()
                + "-"
                + re.sub("\\W+", "-", exp["config"]["env"].lower())
                if config_is_python
                else list(experiments.keys())[0]
            )
            callbacks = [
                WandbLoggerCallback(
                    api_key=args.wandb_key,
                    project=project,
                    upload_checkpoints=True,
                    **({"name": args.wandb_run_name} if args.wandb_run_name else {}),
                )
            ]

        # Try running each test 3 times and make sure it reaches the given
        # reward.
        passed = False
        for i in range(3):
            # Try starting a new ray cluster.
            try:
                ray.init(num_cpus=args.num_cpus, local_mode=args.local_mode)
            # Allow running this script on existing cluster as well.
            except ConnectionError:
                ray.init()
            else:
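                # This `else` branch only runs if the fresh `ray.init()` above
                # succeeded without raising.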
                try:
                    trials = run_experiments(
                        experiments,
                        resume=False,
                        verbose=args.verbose,
                        callbacks=callbacks,
                    )
                finally:
                    ray.shutdown()
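                    # Presumably re-registers RLlib's built-in trainables with Tune
                    # so that a subsequent retry attempt can resolve them again.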
                    _register_all()

            for t in trials:
                # If we have evaluation workers, use their rewards.
                # This is useful for offline learning tests, where
                # we evaluate against an actual environment.
                check_eval = bool(exp["config"].get("evaluation_interval"))
                reward_mean = (
                    t.last_result[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][
                        EPISODE_RETURN_MEAN
                    ]
                    if check_eval
                    else (
                        # Some algos don't store sampler results under `env_runners`
                        # e.g. ARS. Need to keep this logic around for now.
                        t.last_result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
                        if ENV_RUNNER_RESULTS in t.last_result
                        else t.last_result[EPISODE_RETURN_MEAN]
                    )
                )

                # If we are using evaluation workers, we may have
                # a stopping criterion under the "evaluation/" scope. If
                # not, use `episode_return_mean`.
                if check_eval:
                    min_reward = t.stopping_criterion.get(
                        f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/"
                        f"{EPISODE_RETURN_MEAN}",
                        t.stopping_criterion.get(
                            f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
                        ),
                    )
                # Otherwise, expect `env_runners/episode_return_mean` to be set.
                else:
                    min_reward = t.stopping_criterion.get(
                        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
                    )

                # If min reward not defined, always pass.
                if min_reward is None or reward_mean >= min_reward:
                    passed = True
                    break

            if passed:
                print("Regression test PASSED")
                break
            else:
                print("Regression test FAILED on attempt {}".format(i + 1))

        if not passed:
            print("Overall regression FAILED: Exiting with Error.")
            sys.exit(1)