12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- """Durable trainable (16 trials, checkpoint to cloud)
- In this run, we will start 16 trials on a cluster. The trials create
- 10 MB checkpoints every 12 seconds and should only keep 2 of these. This test
- ensures that durable checkpoints don't slow down experiment progress too much.
- Cluster: cluster_16x2.yaml
- Test owner: krfricke
- Acceptance criteria: Should run faster than 500 seconds.
- Theoretical minimum time: 300 seconds
- """
- import argparse
- import os
- import ray
- from ray.tune.utils.release_test_util import timed_tune_run
def _export_aws_credentials():
    """Best-effort export of AWS credentials into ``os.environ``.

    If ``../aws_secrets.txt`` (relative to this file) exists, every option of
    every section in it is exported as an upper-cased environment variable.
    Otherwise the credentials are requested from boto3. Failures on the boto3
    path are reported with a message and never raised.
    """
    secrets_file = os.path.join(os.path.dirname(__file__), "..", "aws_secrets.txt")

    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser

        config = ConfigParser()
        config.read(secrets_file)

        # Section names are irrelevant here; only the options are exported.
        for section in config.values():
            for option, value in section.items():
                os.environ[str(option).upper()] = str(value)
        return

    print("No AWS secrets file found. Loading from boto.")
    try:
        # Imported lazily so that a missing boto3 is handled like any other
        # failure to obtain credentials.
        from boto3 import Session

        credentials = Session().get_credentials()
        # get_credentials() returns None when no credentials are configured.
        frozen = (
            credentials.get_frozen_credentials() if credentials is not None else None
        )
    except Exception:
        # Deliberately broad: credential setup is best effort and must not
        # abort the test.
        frozen = None

    if frozen is None:
        print("Cannot setup AWS credentials (is this running on GCE?)")
        return

    os.environ["AWS_ACCESS_KEY_ID"] = frozen.access_key
    os.environ["AWS_SECRET_ACCESS_KEY"] = frozen.secret_key
    # The token is None for long-lived (non-session) credentials, and
    # os.environ only accepts str values, so export it only when present.
    if frozen.token is not None:
        os.environ["AWS_SESSION_TOKEN"] = frozen.token


def main(bucket):
    """Run the durable trainable release test.

    Exports AWS credentials into the environment (best effort), connects to
    the running Ray cluster and starts a timed Tune run whose trials
    checkpoint to ``bucket``.

    Args:
        bucket: Value forwarded to ``timed_tune_run`` as ``storage_path``.
    """
    _export_aws_credentials()

    # The session token is optional: it only exists for temporary credentials.
    required_env_vars = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if all(os.getenv(name, "") for name in required_env_vars):
        print("AWS secrets found in env.")
    else:
        print("Warning: No AWS secrets found in env!")

    ray.init(address="auto")

    num_samples = 16
    results_per_second = 5 / 60  # 5 results per minute = 1 every 12 seconds
    trial_length_s = 300

    # NOTE(review): the module docstring states an acceptance threshold of
    # 500 seconds - confirm that 650 is the intended limit here.
    max_runtime = 650

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=12,  # Once every 12 seconds (once per result)
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        storage_path=bucket,
    )
if __name__ == "__main__":
    # Script entry point: read the optional --bucket flag and start the test.
    cli = argparse.ArgumentParser()
    cli.add_argument("--bucket", type=str, help="Bucket name")

    # parse_known_args() is used so that unrecognized flags are ignored
    # instead of aborting the run.
    known_args, _unknown = cli.parse_known_args()

    # Fall back to the default bucket when the flag is missing or empty.
    if known_args.bucket:
        target_bucket = known_args.bucket
    else:
        target_bucket = "ray-tune-scalability-test"

    main(target_bucket)
|