runner.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. # Copyright 2020 The Microsoft DeepSpeed Team
  2. """
  3. DeepSpeed runner is the main front-end to launching multi-worker
  4. training jobs with DeepSpeed. By default this uses pdsh to parallel
  5. ssh into multiple worker nodes and launch all the necessary processes
  6. per rank for training.
  7. """
  8. import os
  9. import re
  10. import sys
  11. import json
  12. import base64
  13. import argparse
  14. import subprocess
  15. import collections
  16. from copy import deepcopy
  17. import signal
  18. import time
  19. from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner
  20. from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER
  21. from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
  22. from ..nebula.constants import NEBULA_EXPORT_ENVS
  23. from ..utils import logger
  24. from ..autotuning import Autotuner
  25. from deepspeed.accelerator import get_accelerator
  26. DLTS_HOSTFILE = "/job/hostfile"
  27. EXPORT_ENVS = ['MLFLOW', 'NCCL', 'PYTHON', 'MV2', 'UCX']
  28. EXPORT_ENVS += NEBULA_EXPORT_ENVS
  29. DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env"
  30. DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
  31. PDSH_MAX_FAN_OUT = 1024
  32. def parse_args(args=None):
  33. parser = argparse.ArgumentParser(
  34. description="DeepSpeed runner to help launch distributed "
  35. "multi-node/multi-gpu training jobs.")
  36. parser.add_argument("-H",
  37. "--hostfile",
  38. type=str,
  39. default=DLTS_HOSTFILE,
  40. help="Hostfile path (in MPI style) that defines the "
  41. "resource pool available to the job (e.g., "
  42. "worker-0 slots=4)")
  43. parser.add_argument("-i",
  44. "--include",
  45. type=str,
  46. default="",
  47. help='''Specify hardware resources to use during execution.
  48. String format is
  49. NODE_SPEC[@NODE_SPEC ...],
  50. where
  51. NODE_SPEC=NAME[:SLOT[,SLOT ...]].
  52. If :SLOT is omitted, include all slots on that host.
  53. Example: -i "worker-0@worker-1:0,2" will use all slots
  54. on worker-0 and slots [0, 2] on worker-1.
  55. ''')
  56. parser.add_argument("-e",
  57. "--exclude",
  58. type=str,
  59. default="",
  60. help='''Specify hardware resources to NOT use during execution.
  61. Mutually exclusive with --include. Resource formatting
  62. is the same as --include.
  63. Example: -e "worker-1:0" will use all available
  64. resources except slot 0 on worker-1.
  65. ''')
  66. parser.add_argument("--num_nodes",
  67. type=int,
  68. default=-1,
  69. help="Total number of worker nodes to run on, this will use "
  70. "the top N hosts from the given hostfile.")
  71. parser.add_argument("--min_elastic_nodes",
  72. type=int,
  73. default=-1,
  74. help="Minimum number of nodes to run elastic training on. "
  75. "Default is 1 when elastic training is enabled")
  76. parser.add_argument("--max_elastic_nodes",
  77. type=int,
  78. default=-1,
  79. help="Maximum number of nodes to run elastic training on. "
  80. "Default is num_nodes when elastic training is enabled")
  81. parser.add_argument("--num_gpus",
  82. type=int,
  83. default=-1,
  84. help="Max number of GPUs to use on each node, will use "
  85. "[0:N) GPU ids on each node.")
  86. parser.add_argument("--master_port",
  87. default=TORCH_DISTRIBUTED_DEFAULT_PORT,
  88. type=int,
  89. help="(optional) Port used by PyTorch distributed for "
  90. "communication during training.")
  91. parser.add_argument("--master_addr",
  92. default="",
  93. type=str,
  94. help="(optional) IP address of node 0, will be "
  95. "inferred via 'hostname -I' if not specified.")
  96. parser.add_argument(
  97. "--launcher",
  98. default=PDSH_LAUNCHER,
  99. type=str,
  100. help="(optional) choose launcher backend for multi-node "
  101. "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, MPICH.")
  102. parser.add_argument("--launcher_args",
  103. default="",
  104. type=str,
  105. help="(optional) pass launcher specific arguments as a "
  106. "single quoted argument.")
  107. parser.add_argument("--module",
  108. action="store_true",
  109. help="Change each process to interpret the launch "
  110. "script as a Python module, executing with the same "
  111. "behavior as 'python -m'.")
  112. parser.add_argument("--no_python",
  113. action="store_true",
  114. help="Skip prepending the training script with "
  115. "'python' - just execute it directly.")
  116. parser.add_argument("--no_local_rank",
  117. action="store_true",
  118. help="Do not pass local_rank as an argument when calling "
  119. "the user's training script.")
  120. parser.add_argument("--no_ssh_check",
  121. action="store_true",
  122. help="Do not perform ssh check in multi-node launcher model")
  123. parser.add_argument("--force_multi",
  124. action="store_true",
  125. help="Force multi-node launcher mode, helps in cases where user "
  126. "wants to launch on single remote node.")
  127. parser.add_argument(
  128. "--save_pid",
  129. action="store_true",
  130. help="Save file containing launcher process id (pid) at /tmp/<main-pid>.ds, "
  131. "where <main-pid> is the pid of the first process that invoked `deepspeed`. "
  132. "Useful when launching deepspeed processes programmatically.")
  133. parser.add_argument(
  134. "--enable_each_rank_log",
  135. default="None",
  136. type=str,
  137. help="redirect the stdout and stderr from each rank into different log files")
  138. parser.add_argument(
  139. "--autotuning",
  140. default="",
  141. choices=["tune",
  142. "run"],
  143. type=str,
  144. help="Run DeepSpeed autotuner to discover optimal configuration parameters "
  145. "before running job.")
  146. parser.add_argument("--elastic_training",
  147. action="store_true",
  148. help="Enable elastic training support in DeepSpeed.")
  149. parser.add_argument("user_script",
  150. type=str,
  151. help="User script to launch, followed by any required "
  152. "arguments.")
  153. parser.add_argument('user_args', nargs=argparse.REMAINDER)
  154. return parser.parse_args(args=args)
  155. def fetch_hostfile(hostfile_path):
  156. if not os.path.isfile(hostfile_path):
  157. logger.warning("Unable to find hostfile, will proceed with training "
  158. "with local resources only.")
  159. return None
  160. # e.g., worker-0 slots=16
  161. with open(hostfile_path, 'r') as fd:
  162. hostfile_text = fd.readlines()
  163. return _parse_hostfile(hostfile_text)
  164. def _parse_hostfile(hostfile_lines):
  165. # Regex matches one or more non-whitespace characters (\S+) at the start of
  166. # the line, followed by one or more whitespace characters (\s+), followed
  167. # by the string "slots=", followed by one or more digits (\d+).
  168. pattern = r'^(\S+)\s+slots=(\d+)'
  169. resource_pool = collections.OrderedDict()
  170. for line in hostfile_lines:
  171. line = line.strip()
  172. match = re.search(pattern, line)
  173. if line.startswith("#") or line == "":
  174. # hostfile comment or empty line, ignore
  175. continue
  176. elif match:
  177. host = match.group(1)
  178. num_slots = int(match.group(2))
  179. if host in resource_pool:
  180. logger.error(f"Bad hostfile text: {hostfile_lines}")
  181. raise ValueError(
  182. f"Hostfile contains multiple entries for {host}, unable to proceed with launching"
  183. )
  184. resource_pool[host] = num_slots
  185. else:
  186. logger.error(f"Bad hostfile text: {hostfile_lines}")
  187. raise ValueError(
  188. "Hostfile contains a bad entry: {line}, unable to proceed with launching"
  189. )
  190. if len(resource_pool) == 0:
  191. logger.error(f"Bad hostfile text: {hostfile_lines}")
  192. raise ValueError(
  193. "Hostfile is empty or not formatted correctly, unable to proceed with launching."
  194. )
  195. return resource_pool
  196. def _stable_remove_duplicates(data):
  197. # Create a new list in the same order as original but with duplicates
  198. # removed, should never be more than ~16 elements so simple is best
  199. new_list = []
  200. for x in data:
  201. if x not in new_list:
  202. new_list.append(x)
  203. return new_list
  204. def parse_resource_filter(host_info, include_str="", exclude_str=""):
  205. '''Parse an inclusion or exclusion string and filter a hostfile dictionary.
  206. String format is NODE_SPEC[@NODE_SPEC ...], where
  207. NODE_SPEC = NAME[:SLOT[,SLOT ...]].
  208. If :SLOT is omitted, include/exclude all slots on that host.
  209. Examples:
  210. include_str="worker-0@worker-1:0,2" will use all slots on worker-0 and
  211. slots [0, 2] on worker-1.
  212. exclude_str="worker-1:0" will use all available resources except
  213. slot 0 on worker-1.
  214. '''
  215. # Constants that define our syntax
  216. NODE_SEP = '@'
  217. SLOT_LIST_START = ':'
  218. SLOT_SEP = ','
  219. # Ensure include/exclude are mutually exclusive
  220. if (include_str != "") and (exclude_str != ""):
  221. raise ValueError('include_str and exclude_str are mutually exclusive.')
  222. # no-op
  223. if (include_str == "") and (exclude_str == ""):
  224. return host_info
  225. # Either build from scratch or remove items
  226. filtered_hosts = dict()
  227. if include_str:
  228. parse_str = include_str
  229. if exclude_str != "":
  230. filtered_hosts = deepcopy(host_info)
  231. parse_str = exclude_str
  232. # foreach node in the list
  233. for node_config in parse_str.split(NODE_SEP):
  234. # Node can either be alone or node:slot,slot,slot
  235. if SLOT_LIST_START in node_config:
  236. hostname, slots = node_config.split(SLOT_LIST_START)
  237. slots = [int(x) for x in slots.split(SLOT_SEP)]
  238. # sanity checks
  239. if hostname not in host_info:
  240. raise ValueError(f"Hostname '{hostname}' not found in hostfile")
  241. for slot in slots:
  242. if slot not in host_info[hostname]:
  243. raise ValueError(f"No slot '{slot}' specified on host '{hostname}'")
  244. # If include string, build the list from here
  245. if include_str:
  246. filtered_hosts[hostname] = slots
  247. elif exclude_str:
  248. for slot in slots:
  249. logger.info(f'removing {slot} from {hostname}')
  250. filtered_hosts[hostname].remove(slot)
  251. # User just specified the whole node
  252. else:
  253. hostname = node_config
  254. # sanity check hostname
  255. if hostname not in host_info:
  256. raise ValueError(f"Hostname '{hostname}' not found in hostfile")
  257. if include_str:
  258. filtered_hosts[hostname] = host_info[hostname]
  259. elif exclude_str:
  260. filtered_hosts[hostname] = []
  261. # Post-processing to remove duplicates and empty nodes
  262. del_keys = []
  263. for hostname in filtered_hosts:
  264. # Remove duplicates
  265. filtered_hosts[hostname] = _stable_remove_duplicates(filtered_hosts[hostname])
  266. # Remove empty hosts
  267. if len(filtered_hosts[hostname]) == 0:
  268. del_keys.append(hostname)
  269. for name in del_keys:
  270. del filtered_hosts[name]
  271. # Lastly, go over filtered_hosts and convert to a OrderedDict() to ensure
  272. # we map ranks to nodes correctly by maintaining host_info ordering.
  273. ordered_hosts = collections.OrderedDict()
  274. for host in host_info:
  275. if host in filtered_hosts:
  276. ordered_hosts[host] = filtered_hosts[host]
  277. return ordered_hosts
  278. def parse_inclusion_exclusion(resource_pool, inclusion, exclusion):
  279. active_resources = collections.OrderedDict()
  280. for hostname, slots in resource_pool.items():
  281. active_resources[hostname] = list(range(slots))
  282. return parse_resource_filter(active_resources,
  283. include_str=inclusion,
  284. exclude_str=exclusion)
  285. def encode_world_info(world_info):
  286. world_info_json = json.dumps(world_info).encode('utf-8')
  287. world_info_base64 = base64.urlsafe_b64encode(world_info_json).decode('utf-8')
  288. return world_info_base64
  289. def run_autotuning(args, active_resources):
  290. tuner = Autotuner(args, active_resources)
  291. logger.info("[Start] Running autotuning")
  292. tuner.tune()
  293. tuner.print_tuning_results()
  294. logger.info("[End] Running autotuning")
  295. tuner.write_optimal_config()
  296. if args.autotuning == "run":
  297. tuner.run_after_tuning()
  298. def parse_num_nodes(str_num_nodes: str, elastic_training: bool):
  299. node_list = str_num_nodes.split(":")
  300. if len(node_list) == 1:
  301. min_nodes, max_nodes = int(node_list[0]), -1
  302. elif len(node_list) == 2 and elastic_training:
  303. min_nodes, max_nodes = int(node_list[0]), int(node_list[1])
  304. elif len(node_list) == 2 and not elastic_training:
  305. raise RuntimeError("MIN:MAX format is only supported in elastic training")
  306. else:
  307. raise RuntimeError("num_nodes {} is not in MIN:MAX format".format(str_num_nodes))
  308. return min_nodes, max_nodes
  309. def main(args=None):
  310. args = parse_args(args)
  311. if args.elastic_training:
  312. assert args.master_addr != "", "Master Addr is required when elastic training is enabled"
  313. resource_pool = fetch_hostfile(args.hostfile)
  314. # respect CUDA_VISIBLE_DEVICES for a single node and no explicit resource filters
  315. cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
  316. if not resource_pool and len(cuda_visible_devices):
  317. detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}"
  318. if len(args.include) or len(
  319. args.exclude) or args.num_nodes > 1 or args.num_gpus > 0:
  320. print(
  321. f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed."
  322. )
  323. else:
  324. args.include = f"localhost:{cuda_visible_devices}"
  325. print(f"{detected_str}: setting --include={args.include}")
  326. del os.environ["CUDA_VISIBLE_DEVICES"]
  327. if args.num_nodes >= 0 or args.num_gpus >= 0:
  328. if args.include != "" or args.exclude != "":
  329. raise ValueError("Cannot specify num_nodes/gpus with include/exclude")
  330. multi_node_exec = True
  331. if not resource_pool:
  332. resource_pool = {}
  333. device_count = get_accelerator().device_count()
  334. if device_count == 0:
  335. raise RuntimeError("Unable to proceed, no GPU resources available")
  336. resource_pool['localhost'] = device_count
  337. args.master_addr = "127.0.0.1"
  338. multi_node_exec = False
  339. if not multi_node_exec and args.num_nodes > 1:
  340. raise ValueError("Num nodes is >1 but no extra nodes available via hostfile")
  341. active_resources = parse_inclusion_exclusion(resource_pool,
  342. args.include,
  343. args.exclude)
  344. env = os.environ.copy()
  345. # validate that passwordless-ssh is workly properly with this hostfile
  346. if multi_node_exec and not args.no_ssh_check:
  347. first_host = list(active_resources.keys())[0]
  348. try:
  349. subprocess.check_call(
  350. f'ssh -o PasswordAuthentication=no {first_host} hostname',
  351. stderr=subprocess.DEVNULL,
  352. stdout=subprocess.DEVNULL,
  353. shell=True)
  354. except subprocess.CalledProcessError:
  355. raise RuntimeError(
  356. f"Using hostfile at {args.hostfile} but host={first_host} was not reachable via ssh. If you are running with a single node please remove {args.hostfile} or setup passwordless ssh."
  357. )
  358. if not args.master_addr:
  359. assert multi_node_exec
  360. first_host = list(active_resources.keys())[0]
  361. hostname_cmd = [f"ssh {first_host} hostname -I"]
  362. try:
  363. result = subprocess.check_output(hostname_cmd, shell=True)
  364. except subprocess.CalledProcessError as err:
  365. logger.error(
  366. "Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr"
  367. )
  368. raise err
  369. args.master_addr = result.decode('utf-8').split()[0]
  370. if not args.master_addr:
  371. raise RuntimeError(
  372. f"Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr"
  373. )
  374. logger.info(f"Using IP address of {args.master_addr} for node {first_host}")
  375. if args.autotuning != "":
  376. run_autotuning(args, active_resources)
  377. return
  378. if args.num_nodes > 0:
  379. updated_active_resources = collections.OrderedDict()
  380. for count, hostname in enumerate(active_resources.keys()):
  381. if args.num_nodes == count:
  382. break
  383. updated_active_resources[hostname] = active_resources[hostname]
  384. active_resources = updated_active_resources
  385. if args.num_gpus > 0:
  386. updated_active_resources = collections.OrderedDict()
  387. for hostname in active_resources.keys():
  388. updated_active_resources[hostname] = list(range(args.num_gpus))
  389. active_resources = updated_active_resources
  390. if args.elastic_training:
  391. assert not args.no_local_rank, "--no_local_rank argument is not supported in Elastic training"
  392. # encode world info as base64 to make it easier to pass via command line
  393. world_info_base64 = encode_world_info(active_resources)
  394. multi_node_exec = args.force_multi or len(active_resources) > 1
  395. if not multi_node_exec:
  396. deepspeed_launch = [
  397. sys.executable,
  398. "-u",
  399. "-m",
  400. "deepspeed.launcher.launch",
  401. f"--world_info={world_info_base64}",
  402. f"--master_addr={args.master_addr}",
  403. f"--master_port={args.master_port}"
  404. ]
  405. if args.no_python:
  406. deepspeed_launch.append("--no_python")
  407. if args.module:
  408. deepspeed_launch.append("--module")
  409. if args.no_local_rank:
  410. deepspeed_launch.append("--no_local_rank")
  411. if args.save_pid:
  412. deepspeed_launch += ["--save_pid", f"{os.getpid()}"]
  413. if args.enable_each_rank_log:
  414. deepspeed_launch.append(
  415. f"--enable_each_rank_log={args.enable_each_rank_log}")
  416. if args.elastic_training:
  417. deepspeed_launch.append("--enable_elastic_training")
  418. deepspeed_launch.append(f"--max_elastic_nodes={args.max_elastic_nodes}")
  419. deepspeed_launch.append(f"--min_elastic_nodes={args.min_elastic_nodes}")
  420. cmd = deepspeed_launch + [args.user_script] + args.user_args
  421. else:
  422. args.launcher = args.launcher.lower()
  423. if args.launcher == PDSH_LAUNCHER:
  424. runner = PDSHRunner(args, world_info_base64)
  425. elif args.launcher == OPENMPI_LAUNCHER:
  426. runner = OpenMPIRunner(args, world_info_base64, resource_pool)
  427. elif args.launcher == MPICH_LAUNCHER:
  428. runner = MPICHRunner(args, world_info_base64, resource_pool)
  429. elif args.launcher == MVAPICH_LAUNCHER:
  430. runner = MVAPICHRunner(args, world_info_base64, resource_pool)
  431. elif args.launcher == SLURM_LAUNCHER:
  432. runner = SlurmRunner(args, world_info_base64, resource_pool)
  433. else:
  434. raise NotImplementedError(f"Unknown launcher {args.launcher}")
  435. if not runner.backend_exists():
  436. raise RuntimeError(f"launcher '{args.launcher}' not installed.")
  437. curr_path = os.path.abspath('.')
  438. if 'PYTHONPATH' in env:
  439. env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
  440. else:
  441. env['PYTHONPATH'] = curr_path
  442. exports = ""
  443. for var in env.keys():
  444. if any([var.startswith(name) for name in EXPORT_ENVS]):
  445. runner.add_export(var, env[var])
  446. for environ_path in DEEPSPEED_ENVIRONMENT_PATHS:
  447. environ_file = os.path.join(environ_path, DEEPSPEED_ENVIRONMENT_NAME)
  448. if os.path.isfile(environ_file):
  449. with open(environ_file, 'r') as fd:
  450. for var in fd.readlines():
  451. key, val = var.split('=', maxsplit=1)
  452. runner.add_export(key, val)
  453. if args.launcher == PDSH_LAUNCHER:
  454. cmd, kill_cmd = runner.get_cmd(env, active_resources)
  455. else:
  456. cmd = runner.get_cmd(env, active_resources)
  457. logger.info(f"cmd = {' '.join(cmd)}")
  458. result = subprocess.Popen(cmd, env=env)
  459. def sigkill_handler(signum, frame):
  460. result.send_signal(signal.SIGINT)
  461. time.sleep(0.1)
  462. result.send_signal(signal.SIGTERM)
  463. result_kill = subprocess.Popen(kill_cmd, env=env)
  464. result_kill.wait()
  465. time.sleep(1)
  466. sys.exit(1)
  467. if args.launcher == PDSH_LAUNCHER:
  468. signal.signal(signal.SIGINT, sigkill_handler)
  469. result.wait()
  470. # In case of failure must propagate the error-condition back to the caller (usually shell). The
  471. # actual error and traceback should have been printed in the subprocess, so in order to avoid
  472. # unnecessary noise we just quietly exit here with the same code as the subprocess
  473. if result.returncode > 0:
  474. sys.exit(result.returncode)
  475. if __name__ == "__main__":
  476. main()