#!/usr/bin/env python
# Runs one or more regression tests. Retries tests up to 3 times.
#
# Example usage:
# $ python run_regression_tests.py --dir=regression-tests/cartpole-es-[tf|torch].yaml
#
# When using in BAZEL (with py_test), e.g. see in ray/rllib/BUILD:
# py_test(
#     name = "run_regression_tests",
#     main = "tests/run_regression_tests.py",
#     tags = ["learning_tests"],
#     size = "medium",  # 5min timeout
#     srcs = ["tests/run_regression_tests.py"],
#     data = glob(["tuned_examples/regression_tests/*.yaml"]),
#     # Pass `BAZEL` option and the path to look for yaml regression files.
#     args = ["BAZEL", "tuned_examples/regression_tests"]
# )

import argparse
import os
from pathlib import Path
import re
import sys

import yaml

import ray
from ray import air
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.rllib import _register_all
from ray.rllib.common import SupportedFileType
from ray.rllib.train import load_experiments_from_file
from ray.rllib.utils.deprecation import deprecation_warning
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    EVALUATION_RESULTS,
)
from ray.tune import run_experiments

parser = argparse.ArgumentParser()
parser.add_argument(
    "--framework",
    type=str,
    choices=["torch", "tf2", "tf"],
    default=None,
    help="The deep learning framework to use. If not provided, try using the one "
    "specified in the file; otherwise, use RLlib's default: `torch`.",
)
parser.add_argument(
    "--dir",
    type=str,
    required=True,
    help="The directory or file in which to find all tests.",
)
parser.add_argument(
    "--env",
    type=str,
    default=None,
    help="An optional env override setting. If not provided, try using the one "
    "specified in the file.",
)
parser.add_argument("--num-cpus", type=int, default=None)
parser.add_argument(
    "--local-mode",
    action="store_true",
    help="Run ray in local mode for easier debugging.",
)
parser.add_argument(
    "--num-samples",
    type=int,
    default=1,
    help="The number of seeds/samples to run with the given experiment config.",
)
parser.add_argument(
    "--override-mean-reward",
    type=float,
    default=0.0,
    help=(
        "Override the mean reward specified by the yaml file in the stopping "
        "criteria. This is particularly useful for timed tests."
    ),
)
parser.add_argument(
    "--verbose",
    type=int,
    default=2,
    help="The verbosity level for the main `tune.run_experiments()` call.",
)
parser.add_argument(
    "--wandb-key",
    type=str,
    default=None,
    help="The WandB API key to use for uploading results.",
)
parser.add_argument(
    "--wandb-project",
    type=str,
    default=None,
    help="The WandB project name to use.",
)
parser.add_argument(
    "--wandb-run-name",
    type=str,
    default=None,
    help="The WandB run name to use.",
)
# parser.add_argument(
#     "--wandb-from-checkpoint",
#     type=str,
#     default=None,
#     help=(
#         "The WandB checkpoint location (e.g. `[team name]/[project name]/checkpoint_"
#         "[run name]:v[version]`) from which to resume an experiment."
#     ),
# )
parser.add_argument(
    "--checkpoint-freq",
    type=int,
    default=0,
    help=(
        "The frequency (in training iterations) with which to create checkpoints. "
        "Note that if --wandb-key is provided, these checkpoints will automatically "
        "be uploaded to WandB."
    ),
)
# Obsoleted arg, use --dir instead.
parser.add_argument("--yaml-dir", type=str, default="")


if __name__ == "__main__":
    args = parser.parse_args()

    if args.yaml_dir != "":
        deprecation_warning(old="--yaml-dir", new="--dir", error=True)
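    # As a rough illustration (a hypothetical file, not shipped with this
    # script; real test files live under e.g. `tuned_examples/`), each yaml
    # file consumed below is expected to define exactly one experiment:
    #
    #   cartpole-ppo:
    #       env: CartPole-v1
    #       run: PPO
    #       stop:
    #           env_runners/episode_return_mean: 150.0
    #       config:
    #           framework: torch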
    # Bazel regression test mode: Get path to look for yaml files.
    # Get the path or single file to use.
    rllib_dir = Path(__file__).parent.parent
    print(f"rllib dir={rllib_dir}")

    abs_path = os.path.join(rllib_dir, args.dir)
    # Single file given.
    if os.path.isfile(abs_path):
        files = [abs_path]
    # Directory given -> Get all yaml/yml/py files in there via rglob.
    elif os.path.isdir(abs_path):
        files = []
        for type_ in ["yaml", "yml", "py"]:
            files += list(rllib_dir.rglob(args.dir + f"/*.{type_}"))
        files = sorted(map(lambda path: str(path.absolute()), files), reverse=True)
    # Given path/file does not exist.
    else:
        raise ValueError(f"--dir ({args.dir}) not found!")

    print("Will run the following regression tests:")
    for file in files:
        print("->", file)

    # Loop through all collected files.
    for file in files:
        config_is_python = False
        # For python files, make sure we only deliver the module name into the
        # `load_experiments_from_file` function (everything from "/ray/rllib" on).
        if file.endswith(".py"):
            # Weird CI learning test (BAZEL) case.
            if file.endswith("__init__.py"):
                continue
            experiments = load_experiments_from_file(file, SupportedFileType.python)
            config_is_python = True
        else:
            experiments = load_experiments_from_file(file, SupportedFileType.yaml)

        assert (
            len(experiments) == 1
        ), "Error, can only run a single experiment per file!"

        exp = list(experiments.values())[0]
        exp_name = list(experiments.keys())[0]

        # Set the number of samples to run.
        exp["num_samples"] = args.num_samples

        # Make sure there is a config and a stopping criterion.
        exp["config"] = exp.get("config", {})
        exp["stop"] = exp.get("stop", {})

        # Override the framework setting with the command line one, if provided.
        # Otherwise, use the framework setting in the file (or the default: torch).
        if args.framework is not None:
            exp["config"]["framework"] = args.framework

        # Override the env setting, if given on the command line.
        if args.env is not None:
            exp["config"]["env"] = args.env
        else:
            exp["config"]["env"] = exp["env"]

        # Override the mean reward, if specified. This is used by the Ray CI
        # to override the episode reward mean for tf2 versions of off-policy,
        # long-running learning tests, such as SAC and DDPG on the Pendulum
        # environment.
        if args.override_mean_reward != 0.0:
            exp["stop"][
                f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
            ] = args.override_mean_reward

        # Checkpoint settings.
        exp["checkpoint_config"] = air.CheckpointConfig(
            checkpoint_frequency=args.checkpoint_freq,
            checkpoint_at_end=args.checkpoint_freq > 0,
        )

        # Always run with eager-tracing when framework=tf2, if not in local-mode
        # and unless the file explicitly tells us to disable eager tracing.
        if (
            (args.framework == "tf2" or exp["config"].get("framework") == "tf2")
            and not args.local_mode
            # Note: This check will always fail for python configs, b/c normally,
            # algorithm configs have `self.eager_tracing=False` by default.
            # Thus, you'd have to set `eager_tracing` to True explicitly in your
            # python config to make sure we are indeed using eager tracing.
            and exp["config"].get("eager_tracing") is not False
        ):
            exp["config"]["eager_tracing"] = True
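        # At this point, `experiments` holds a single, fully resolved experiment
        # spec. Roughly (illustrative values only, assuming the hypothetical
        # cartpole-ppo yaml sketched near the top of this script), it looks like:
        #
        #   {
        #       "cartpole-ppo": {
        #           "env": "CartPole-v1",
        #           "run": "PPO",
        #           "num_samples": 1,
        #           "stop": {"env_runners/episode_return_mean": 150.0},
        #           "checkpoint_config": air.CheckpointConfig(...),
        #           "config": {"framework": "torch", "env": "CartPole-v1"},
        #       },
        #   }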
        # Print out the actual config (not for py files, as yaml.dump weirdly
        # fails on those).
        if not config_is_python:
            print("== Test config ==")
            print(yaml.dump(experiments))

        callbacks = None
        if args.wandb_key is not None:
            project = args.wandb_project or (
                exp["run"].lower()
                + "-"
                + re.sub(r"\W+", "-", exp["config"]["env"].lower())
                if config_is_python
                else list(experiments.keys())[0]
            )
            callbacks = [
                WandbLoggerCallback(
                    api_key=args.wandb_key,
                    project=project,
                    upload_checkpoints=True,
                    **({"name": args.wandb_run_name} if args.wandb_run_name else {}),
                )
            ]

        # Try running each test 3 times and make sure it reaches the given
        # reward.
        passed = False
        for i in range(3):
            # Try starting a new ray cluster.
            try:
                ray.init(num_cpus=args.num_cpus, local_mode=args.local_mode)
            # Allow running this script on an existing cluster as well.
            except ConnectionError:
                ray.init()
            else:
                try:
                    trials = run_experiments(
                        experiments,
                        resume=False,
                        verbose=args.verbose,
                        callbacks=callbacks,
                    )
                finally:
                    ray.shutdown()
                    _register_all()

            for t in trials:
                # If we have evaluation workers, use their rewards.
                # This is useful for offline learning tests, where
                # we evaluate against an actual environment.
                check_eval = bool(exp["config"].get("evaluation_interval"))
                reward_mean = (
                    t.last_result[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][
                        EPISODE_RETURN_MEAN
                    ]
                    if check_eval
                    else (
                        # Some algos don't store sampler results under
                        # `env_runners`, e.g. ARS. Need to keep this logic
                        # around for now.
                        t.last_result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
                        if ENV_RUNNER_RESULTS in t.last_result
                        else t.last_result[EPISODE_RETURN_MEAN]
                    )
                )

                # If we are using evaluation workers, we may have
                # a stopping criterion under the "evaluation/" scope. If
                # not, use `episode_return_mean`.
                if check_eval:
                    min_reward = t.stopping_criterion.get(
                        f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/"
                        f"{EPISODE_RETURN_MEAN}",
                        t.stopping_criterion.get(
                            f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
                        ),
                    )
                # Otherwise, expect `env_runners/episode_return_mean` to be set.
                else:
                    min_reward = t.stopping_criterion.get(
                        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
                    )

                # If min reward is not defined, always pass.
                if min_reward is None or reward_mean >= min_reward:
                    passed = True
                    break

            if passed:
                print("Regression test PASSED")
                break
            else:
                print("Regression test FAILED on attempt {}".format(i + 1))

        if not passed:
            print("Overall regression FAILED: Exiting with Error.")
            sys.exit(1)
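# Example local invocations (paths and values below are illustrative only; see
# the header comment at the top of this file for the BAZEL setup):
# $ python run_regression_tests.py --dir=tuned_examples/regression_tests
# $ python run_regression_tests.py --dir=tuned_examples/regression_tests \
#       --framework=torch --num-samples=3 --override-mean-reward=100.0 \
#       --checkpoint-freq=10 --wandb-key=<your WandB API key>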