groups.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. # Copyright (c) Microsoft Corporation.
  2. # SPDX-License-Identifier: Apache-2.0
  3. # DeepSpeed Team
  4. # The file has been adapted from https://github.com/NVIDIA/Megatron-LM and retains the following license from the original file
  5. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  6. #
  7. # Licensed under the Apache License, Version 2.0 (the "License");
  8. # you may not use this file except in compliance with the License.
  9. # You may obtain a copy of the License at
  10. #
  11. # http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS,
  15. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. # See the License for the specific language governing permissions and
  17. # limitations under the License.
  18. """
  19. Support different forms of parallelism in DeepSpeed using multiple process groups.
  20. Given that there are multiple scenarios and use-cases, this file is going to be updated
  21. frequently. For now, the group creation needed for the training scenario is being implemented.
  22. For inference and other new scenarios, the code will be either reused or added to this file.
  23. """
  24. from deepspeed import comm as dist
  25. from deepspeed.utils import log_dist
  26. from deepspeed.utils.exceptions import DeprecatedException
  27. from deepspeed.accelerator import get_accelerator
# Expert parallel groups that the current rank belongs to,
# keyed by group name of the form "ep_size_<ep_size>".
_EXPERT_PARALLEL_GROUP = {}
# Expert data parallel groups that the current rank belongs to,
# keyed by group name of the form "ep_size_<ep_size>".
_EXPERT_DATA_PARALLEL_GROUP = {}
# dist world group needs to be cloned for some cases
_WORLD_GROUP = None
# ZeRO parameter partitioning group that the current rank belongs to.
_ZERO_PARAM_INTRA_PARALLEL_GROUP = None
# global object to maintain mpu object if passed by a Megatron client
mpu = None
# global object that stores tensor parallel world size for experts
expert_tensor_parallel_world_size = 1
# All-to-all quantized gradient communication groups,
# keyed "local_<i>" / "global_<i>".
_ALL_TO_ALL_GROUP = {}
# NOTE(review): the functions visible in this file only bind same-named
# locals; nothing here assigns this module-level value -- confirm whether
# it is set from elsewhere.
_DATA_PARALLEL_GROUP = None
  43. # Deprecated groups initialize function.
  44. def initialize(ep_size=1, mpu=None):
  45. """ Deprecated function. Retained to inform the users."""
  46. raise DeprecatedException(
  47. "Please do not use the groups.initialize() API as it is deprecated. Instead, pass the desired ep_size to deepspeed.moe.layer.MoE(..,ep_size,..)"
  48. )
  49. def _ensure_divisibility(numerator, denominator):
  50. """Ensure that numerator is divisible by the denominator."""
  51. assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator)
  52. # Not currently used. Helper function to create a model (tensor) parallel group.
  53. def _create_model_parallel(model_parallel_size_):
  54. """
  55. Initialize model data parallel groups.
  56. Arguments:
  57. model_parallel_size: number of GPUs used to parallelize model.
  58. Returns:
  59. Tuple of data parallel group and model parallel group
  60. Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
  61. use 2 GPUs to parallelize the model. The present function will
  62. create 4 model parallel groups and 2 data parallel groups as:
  63. 4 model parallel groups:
  64. [g0, g1], [g2, g3], [g4, g5], [g6, g7]
  65. 2 data parallel groups:
  66. [g0, g2, g4, g6], [g1, g3, g5, g7]
  67. Note that for efficiency, the caller should make sure adjacent ranks
  68. are on the same DGX box. For example if we are using 2 DGX-1 boxes
  69. with a total of 16 GPUs, rank 0 to 7 belong to the first box and
  70. ranks 8 to 15 belong to the second box.
  71. """
  72. log_dist(f'Creating model parallel group with size {model_parallel_size_}', ranks=[0])
  73. # Get world size and rank. Ensure some consistencies.
  74. assert dist.is_initialized()
  75. world_size = dist.get_world_size()
  76. model_parallel_size = min(model_parallel_size_, world_size)
  77. _ensure_divisibility(world_size, model_parallel_size)
  78. rank = dist.get_rank()
  79. _DATA_PARALLEL_GROUP = None
  80. _MODEL_PARALLEL_GROUP = None
  81. # Build the data parallel groups.
  82. for i in range(model_parallel_size):
  83. ranks = range(i, world_size, model_parallel_size)
  84. group = dist.new_group(ranks)
  85. if i == (rank % model_parallel_size):
  86. _DATA_PARALLEL_GROUP = group
  87. # Build the model parallel groups.
  88. for i in range(world_size // model_parallel_size):
  89. ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
  90. group = dist.new_group(ranks)
  91. if i == (rank // model_parallel_size):
  92. _MODEL_PARALLEL_GROUP = group
  93. return _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
  94. def _create_expert_and_data_parallel(expert_parallel_size_):
  95. """
  96. Create expert and data parallel groups.
  97. Note: Caller of this function is responsible to check if the groups already exist.
  98. Example - E + D parallel
  99. world_size = 16
  100. expert_parallel_size = 2 # number of experts in same group
  101. expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
  102. expert_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - no all reduce, but all to all
  103. data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
  104. """
  105. assert dist.is_initialized()
  106. log_dist(f'Creating expert and data parallel groups with size {expert_parallel_size_}', ranks=[0])
  107. world_size = dist.get_world_size()
  108. rank = dist.get_rank()
  109. _ensure_divisibility(world_size, expert_parallel_size_)
  110. group_name = f"ep_size_{expert_parallel_size_}"
  111. # Build the expert data parallel groups.
  112. global _EXPERT_DATA_PARALLEL_GROUP
  113. # Only create group if it does not already exist
  114. if group_name not in _EXPERT_DATA_PARALLEL_GROUP:
  115. for i in range(expert_parallel_size_):
  116. ranks = range(i, world_size, expert_parallel_size_)
  117. group = dist.new_group(ranks)
  118. log_dist(f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', [0])
  119. if i == (rank % expert_parallel_size_):
  120. _EXPERT_DATA_PARALLEL_GROUP[group_name] = group
  121. # Build the expert parallel groups.
  122. global _EXPERT_PARALLEL_GROUP
  123. # Only create group if it does not already exist
  124. if group_name not in _EXPERT_PARALLEL_GROUP:
  125. for i in range(world_size // expert_parallel_size_):
  126. ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_)
  127. group = dist.new_group(ranks)
  128. log_dist(f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0])
  129. if i == (rank // expert_parallel_size_):
  130. _EXPERT_PARALLEL_GROUP[group_name] = group
  131. def _get_expert_parallel_ranks(world_size, model_parallel_size_, expert_parallel_size_):
  132. """Generate expert parallel and expert data parallel group ranks list.
  133. Example - E + M + D parallel
  134. world_size = 16
  135. model_degree = 2
  136. expert_degree = 4 # number of experts in same group
  137. mp_group = [0, 1], [2,3], [4,5] ...
  138. data_parallel_group =[0,2,4,6,8,10, 12,14], [1,3,5,7,9,11,13,15]
  139. expert_parallel_group = [0,2,4,6], [8,10,12,14] [1,3,5,7], [9,11,13,15]
  140. expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15]
  141. Args:
  142. world_size (int): Distributed world size.
  143. model_parallel_size_ (int): Model parallel group size.
  144. expert_parallel_size_ (int): Expert parallel group size.
  145. Returns:
  146. Expert parallel group ranks and Expert data parallel group ranks list.
  147. """
  148. _ensure_divisibility(world_size, model_parallel_size_)
  149. dp_world_size = world_size // model_parallel_size_
  150. _ensure_divisibility(dp_world_size, expert_parallel_size_)
  151. # Generate data parallel groups
  152. data_parallel_groups = []
  153. dp_group_size = model_parallel_size_
  154. for i in range(dp_group_size):
  155. data_parallel_groups.append(list(range(i, world_size, dp_group_size)))
  156. expert_parallel_groups = []
  157. expert_data_parallel_groups = []
  158. for dp_ranks in data_parallel_groups:
  159. # partition of expert parallel groups, e.g. [0,2,4,6], [8,10,12,14]
  160. part_ep_groups = []
  161. for i in range(0, dp_world_size, expert_parallel_size_):
  162. part_ep_groups.append(dp_ranks[i:i + expert_parallel_size_])
  163. expert_parallel_groups.extend(part_ep_groups)
  164. # zip part_ep_groups get expert data parallel ranks, e.g [0,8],[2,10],[4,12],[6,14]
  165. for expert_dp_ranks in zip(*part_ep_groups):
  166. expert_data_parallel_groups.append(list(expert_dp_ranks))
  167. return expert_parallel_groups, expert_data_parallel_groups
  168. def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu):
  169. """
  170. Create expert and data parallel groups based on MPU (model parallel) group.
  171. Note: Caller of this function is responsible to check if the groups already exist.
  172. Example - E + M + D parallel
  173. world_size = 16
  174. model_degree = 2
  175. expert_degree = 4 # number of experts in same group
  176. mp_group = [0, 1], [2,3], [4,5] ...
  177. data_parallel_group =[0,2,4,6,8,10, 12,14], [1,3,5,7,9,11,13,15]
  178. expert_parallel_group = [0,2,4,6], [8,10,12,14] [1,3,5,7], [9,11,13,15]
  179. expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15]
  180. """
  181. assert dist.is_initialized(), "dist is not initialized"
  182. model_parallel_size_ = mpu.get_model_parallel_world_size()
  183. global expert_tensor_parallel_world_size
  184. expert_tensor_parallel_world_size = model_parallel_size_
  185. world_size = dist.get_world_size()
  186. rank = dist.get_rank()
  187. dp_world_size = mpu.get_data_parallel_world_size()
  188. dp_rank = mpu.get_data_parallel_rank()
  189. _ensure_divisibility(world_size, model_parallel_size_)
  190. _ensure_divisibility(dp_world_size, expert_parallel_size_)
  191. log_dist(
  192. f"Creating deepspeed groups with model parallel size {model_parallel_size_}, expert parallel size {expert_parallel_size_}, world size {world_size}, dp world size {dp_world_size}",
  193. [0])
  194. global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP
  195. # Get world size and rank. Ensure some consistencies.
  196. _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
  197. _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()
  198. group_name = f"ep_size_{expert_parallel_size_}"
  199. # Only create groups if they don't already exist
  200. # Need to check conditions outside the group creation loop because of the way torch.dist group creation works
  201. if group_name not in _EXPERT_DATA_PARALLEL_GROUP and group_name not in _EXPERT_PARALLEL_GROUP:
  202. expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks(
  203. world_size, model_parallel_size_, expert_parallel_size_)
  204. for ranks in expert_parallel_groups:
  205. group = dist.new_group(ranks)
  206. if rank in list(ranks):
  207. _EXPERT_PARALLEL_GROUP[group_name] = group
  208. for ranks in expert_data_parallel_groups:
  209. group = dist.new_group(ranks)
  210. if rank in list(ranks):
  211. _EXPERT_DATA_PARALLEL_GROUP[group_name] = group
  212. def _get_max_expert_size():
  213. """Get the maximum ep_size from all the created groups."""
  214. assert _EXPERT_PARALLEL_GROUP is not None, "Warning! Process group not initialized"
  215. keylist = []
  216. for key in _EXPERT_PARALLEL_GROUP.keys():
  217. # index 2 is ep_size in the group name: ep_size_<ep_size>
  218. index = 2
  219. keylist.append(int(key.split('_')[index]))
  220. return max(keylist) if len(keylist) > 0 else None
  221. def _get_max_expert_size_name():
  222. """Get the name of the group with max. ep_size"""
  223. return f'ep_size_{_get_max_expert_size()}'
  224. def _get_max_expert_parallel_group():
  225. """Get the max expert parallel size."""
  226. return _get_expert_parallel_group(_get_max_expert_size_name())
  227. def _get_expert_parallel_group(group_name):
  228. """Get the expert parallel group the caller rank belongs to."""
  229. assert group_name in _EXPERT_PARALLEL_GROUP, \
  230. 'expert parallel group is not initialized'
  231. return _EXPERT_PARALLEL_GROUP[group_name]
  232. def _get_expert_parallel_group_dict():
  233. """Get the expert parallel group dict."""
  234. return _EXPERT_PARALLEL_GROUP
  235. def _get_expert_data_parallel_group(group_name):
  236. """Get the expert data parallel group the caller rank belongs to."""
  237. assert group_name in _EXPERT_DATA_PARALLEL_GROUP, \
  238. 'expert data parallel group is not initialized'
  239. return _EXPERT_DATA_PARALLEL_GROUP[group_name]
  240. def _get_expert_data_parallel_group_dict():
  241. """Get the expert data parallel group dict."""
  242. return _EXPERT_DATA_PARALLEL_GROUP
  243. def _clone_world_group():
  244. """Create a clone of the world group
  245. Note: We need to clone the dist world group because we
  246. use dist.get_global_rank() utility function in DeepSpeed at many places.
  247. As that function does not work on dist.group.WORLD, we
  248. need to keep a clone of it.
  249. """
  250. assert dist.is_initialized(), "dist is not initialized"
  251. global _WORLD_GROUP
  252. if _WORLD_GROUP is None:
  253. # If not cloned already, clone the world group
  254. _WORLD_GROUP = dist.new_group(ranks=range(dist.get_world_size()))
  255. return _WORLD_GROUP
  256. def _get_local_all_to_all_group():
  257. assert dist.is_initialized(), 'dist is not initialized'
  258. global _ALL_TO_ALL_GROUP
  259. device_per_node = get_accelerator().device_count()
  260. num_local = dist.get_world_size() // device_per_node
  261. if num_local == 0 and dist.get_world_size() > 0:
  262. assert dist.get_world_size() >= 1, 'num_gpus must >=1, cannot initialize All-To-All'
  263. cur_rank = []
  264. for i in range(dist.get_world_size()):
  265. cur_rank.append(i)
  266. _ALL_TO_ALL_GROUP['local_0'] = dist.new_group(ranks=cur_rank)
  267. elif num_local == 1:
  268. assert dist.get_world_size(
  269. ) == device_per_node, 'num_gpus not equal to device per node, cannot initialize All-To-All'
  270. _ALL_TO_ALL_GROUP['local_0'] = dist.new_group(ranks=[i for i in range(device_per_node)])
  271. else:
  272. assert dist.get_world_size() > device_per_node, 'num_nodes<2 cannot initialize All-To-All'
  273. for i in range(num_local):
  274. local_rank = [j + device_per_node * i for j in range(device_per_node)]
  275. _ALL_TO_ALL_GROUP[f"local_{i}"] = dist.new_group(ranks=local_rank)
  276. for i in range(device_per_node):
  277. cur_rank = []
  278. for j in range(num_local):
  279. cur_rank.append(i + j * device_per_node)
  280. _ALL_TO_ALL_GROUP[f"global_{i}"] = dist.new_group(ranks=cur_rank)
  281. return _ALL_TO_ALL_GROUP
  282. def _get_data_parallel_group():
  283. """Get the data parallel group the caller rank belongs to."""
  284. assert dist.is_initialized(), 'dist is not initialized'
  285. global mpu
  286. if mpu is not None:
  287. return mpu.get_data_parallel_group()
  288. # Return the clone of dist world group
  289. return _clone_world_group()
  290. def _get_broadcast_src_rank():
  291. return dist.get_global_rank(_get_data_parallel_group(), 0)
  292. def _get_expert_broadcast_src_rank(group_name):
  293. return dist.get_global_rank(_get_expert_data_parallel_group(group_name), 0)
  294. def _get_expert_parallel_world_size(group_name):
  295. """Return world size for the expert parallel group."""
  296. return dist.get_world_size(group=_get_expert_parallel_group(group_name))
  297. def _get_expert_data_parallel_world_size(group_name):
  298. """Return world size for the expert data parallel group."""
  299. return dist.get_world_size(group=_get_expert_data_parallel_group(group_name))
  300. def _get_expert_parallel_rank(group_name):
  301. """Return my rank for the expert parallel group."""
  302. return dist.get_rank(group=_get_expert_parallel_group(group_name))
  303. def _get_expert_parallel_src_rank(group_name):
  304. """Calculate the global rank corresponding to a local rank zero
  305. in the expert parallel group."""
  306. global_rank = dist.get_rank()
  307. local_world_size = _get_expert_parallel_world_size(group_name)
  308. return (global_rank // local_world_size) * local_world_size
  309. def _get_expert_data_parallel_rank(group_name):
  310. """Return my rank for the expert data parallel group."""
  311. return dist.get_rank(group=_get_expert_data_parallel_group(group_name))
  312. def _get_data_parallel_world_size():
  313. """Return world size for the data parallel group."""
  314. global mpu
  315. if mpu is not None:
  316. return mpu.get_data_parallel_world_size()
  317. return dist.get_world_size(group=_get_data_parallel_group())
  318. def _get_model_parallel_world_size():
  319. """Return world size for the model parallel group."""
  320. global mpu
  321. if mpu is not None:
  322. return mpu.get_model_parallel_world_size()
  323. return 1
  324. def _get_data_parallel_rank():
  325. """Return my rank for the data parallel group."""
  326. global mpu
  327. if mpu is not None:
  328. return mpu.get_data_parallel_rank()
  329. return dist.get_rank(group=_get_data_parallel_group())
  330. def _get_expert_model_parallel_world_size():
  331. global expert_tensor_parallel_world_size
  332. return expert_tensor_parallel_world_size
  333. def _create_zero_param_parallel_group(group_size):
  334. """
  335. Create parameter partitioning group within ZeRO data parallel groups.
  336. Example - ZP + D parallel
  337. world_size = 16
  338. zero_hpz_partition_size = 2 # number of ranks with with replicated params (dual partitioning)
  339. zero_param_intra_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - segmented (subgroup) with rep partition
  340. data_parallel_group = [0,1,...,15] - all reduce is on ZeRO model
  341. """
  342. assert dist.is_initialized()
  343. global _ZERO_PARAM_INTRA_PARALLEL_GROUP
  344. # Only create group if it does not already exist
  345. assert _ZERO_PARAM_INTRA_PARALLEL_GROUP is None, \
  346. 'ZeRO parameter intra parallel group is already initialized'
  347. world_size = dist.get_world_size()
  348. rank = dist.get_rank()
  349. zero_param_parallel_size_ = min(group_size, world_size)
  350. _ensure_divisibility(world_size, zero_param_parallel_size_)
  351. # Build the ZeRO param intra parallel groups.
  352. for i in range(world_size // zero_param_parallel_size_):
  353. ranks = range(i * zero_param_parallel_size_, (i + 1) * zero_param_parallel_size_)
  354. group = dist.new_group(ranks)
  355. if i == (rank // zero_param_parallel_size_):
  356. _ZERO_PARAM_INTRA_PARALLEL_GROUP = group
  357. def _get_zero_param_intra_parallel_group():
  358. """Get the ZeRO parameter partitioning intra parallel group the caller rank belongs to."""
  359. #assert _ZERO_PARAM_INTRA_PARALLEL_GROUP is not None, \
  360. # 'ZeRO parameter partitioning group is not initialized'
  361. #TODO: Add warning
  362. return _ZERO_PARAM_INTRA_PARALLEL_GROUP
  363. def _zero_param_parallel_is_initialized():
  364. """Check if ZeRO data parallel with parameter partititioning groups are initialized."""
  365. ###TODO: assert that MPU is not set
  366. if _ZERO_PARAM_INTRA_PARALLEL_GROUP is None and _DATA_PARALLEL_GROUP is None:
  367. return False
  368. def _get_zero_param_intra_parallel_rank_in_mygroup():
  369. """Return my rank for the ZeRO parameter inter parallel group."""
  370. return dist.get_rank(group=_get_zero_param_intra_parallel_group())
  371. def _get_zero_param_intra_parallel_group_world_size():
  372. """Return world size for the ZeRO parameter parallel group."""
  373. return dist.get_world_size(group=_get_zero_param_intra_parallel_group())
  374. def _get_zero_param_intra_parallel_group_ranks():
  375. """Return all ranks for the ZeRO parameter intra parallel group."""
  376. return dist.get_all_ranks_from_group(group=_get_zero_param_intra_parallel_group())