123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- # slurm-launch.py
- # Usage:
- # python slurm-launch.py --exp-name test \
- # --command "rllib train --run PPO --env CartPole-v0"
- import argparse
- import subprocess
- import sys
- import time
- from pathlib import Path
# The template lives next to this script. ``Path(__file__)`` is the script
# file itself, so the template name must be joined onto its parent directory;
# joining it onto the file path yields a location that cannot exist.
template_file = Path(__file__).parent / "slurm-template.sh"

# Placeholder tokens looked up in the template text. The launcher below
# substitutes each one with a value derived from the command-line arguments.
JOB_NAME = "${JOB_NAME}"
NUM_NODES = "${NUM_NODES}"
NUM_GPUS_PER_NODE = "${NUM_GPUS_PER_NODE}"
PARTITION_OPTION = "${PARTITION_OPTION}"
COMMAND_PLACEHOLDER = "${COMMAND_PLACEHOLDER}"
GIVEN_NODE = "${GIVEN_NODE}"
LOAD_ENV = "${LOAD_ENV}"
if __name__ == "__main__":
    # ----- Command-line interface -----
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--exp-name",
        type=str,
        required=True,
        help="The job name and path to logging file (exp_name.log).",
    )
    cli.add_argument(
        "--num-nodes", "-n", type=int, default=1, help="Number of nodes to use."
    )
    cli.add_argument(
        "--node",
        "-w",
        type=str,
        help="The specified nodes to use. Same format as the "
        "return of 'sinfo'. Default: ''.",
    )
    cli.add_argument(
        "--num-gpus",
        type=int,
        default=0,
        help="Number of GPUs to use in each node. (Default: 0)",
    )
    cli.add_argument(
        "--partition",
        "-p",
        type=str,
    )
    cli.add_argument(
        "--load-env",
        type=str,
        help="The script to load your environment ('module load cuda/10.1')",
        default="",
    )
    cli.add_argument(
        "--command",
        type=str,
        required=True,
        help="The command you wish to execute. For example: "
        " --command 'python test.py'. "
        "Note that the command must be a string.",
    )
    opts = cli.parse_args()

    # ----- Values derived from the arguments -----
    # The optional #SBATCH directives collapse to an empty string when the
    # corresponding flag was not given.
    # NOTE: --num-nodes is not checked against the explicit node list.
    node_info = f"#SBATCH -w {opts.node}" if opts.node else ""
    partition_option = (
        f"#SBATCH --partition={opts.partition}" if opts.partition else ""
    )
    # A month/day-hour/minute stamp (local time) is appended to the name.
    job_name = f"{opts.exp_name}_{time.strftime('%m%d-%H%M')}"

    # ----- Fill in the template -----
    # Substitutions are applied strictly in this order, one after another.
    substitutions = (
        (JOB_NAME, job_name),
        (NUM_NODES, str(opts.num_nodes)),
        (NUM_GPUS_PER_NODE, str(opts.num_gpus)),
        (PARTITION_OPTION, partition_option),
        (COMMAND_PLACEHOLDER, str(opts.command)),
        (LOAD_ENV, str(opts.load_env)),
        (GIVEN_NODE, node_info),
        (
            "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO "
            "PRODUCTION!",
            "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE "
            "RUNNABLE!",
        ),
    )
    with open(template_file) as template:
        text = template.read()
    for placeholder, value in substitutions:
        text = text.replace(placeholder, value)

    # ----- Write the generated script into the current directory -----
    script_file = f"{job_name}.sh"
    with open(script_file, "w") as generated:
        generated.write(text)

    # ----- Hand the script to Slurm -----
    print("Starting to submit job!")
    # sbatch is launched without waiting for it to finish.
    subprocess.Popen(["sbatch", script_file])
    print(
        f"Job submitted! Script file is at: <{script_file}>. "
        f"Log file is at: <{job_name}.log>"
    )
    sys.exit(0)
|