# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team


def bwc_tensor_model_parallel_rank(mpu=None):
    """Backwards-compatible way of querying the tensor model parallel rank from
    an ``mpu`` object.

    *Tensor* model parallelism means that tensors are physically split across
    processes. This contrasts with *pipeline* model parallelism, in which the
    layers are partitioned but tensors are left intact.

    The API for tensor model parallelism has changed across versions and this
    helper provides a best-effort implementation across versions of ``mpu``
    objects. The preferred mechanism is
    ``mpu.get_tensor_model_parallel_rank()``.

    This should "just work" with both Megatron-LM and DeepSpeed's pipeline
    parallelism.

    Args:
        mpu (model parallel unit, optional): The model parallel unit to query.
            If ``mpu=None``, returns 0. Defaults to ``None``.

    Returns:
        int: the tensor model parallel rank
    """
    if mpu is None:
        # No model parallelism, so the rank is trivially 0.
        return 0

    if hasattr(mpu, 'get_tensor_model_parallel_rank'):
        # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
        return mpu.get_tensor_model_parallel_rank()
    elif hasattr(mpu, 'get_slice_parallel_rank'):
        # Some DeepSpeed + pipeline parallelism versions
        return mpu.get_slice_parallel_rank()
    else:
        # Deprecated Megatron and DeepSpeed convention
        return mpu.get_model_parallel_rank()
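

# A minimal, hedged sanity check for the fallback chain above. ``_LegacyMPU``
# is a hypothetical stand-in that mimics the deprecated Megatron convention
# (only ``get_model_parallel_rank`` is defined); it is illustrative only and
# not part of DeepSpeed or Megatron-LM.
def _demo_bwc_tensor_model_parallel_rank():
    class _LegacyMPU:

        def get_model_parallel_rank(self):
            return 3

    # ``mpu=None`` means no model parallelism, so the rank is 0.
    assert bwc_tensor_model_parallel_rank(None) == 0
    # The legacy object lacks the newer accessors, so ``hasattr`` routes the
    # call to the deprecated ``get_model_parallel_rank``.
    assert bwc_tensor_model_parallel_rank(_LegacyMPU()) == 3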


def bwc_tensor_model_parallel_world_size(mpu=None):
    """Backwards-compatible way of querying the tensor model parallel world size.

    Similar to ``bwc_tensor_model_parallel_rank``.
    """
    if mpu is None:
        return 1

    if hasattr(mpu, 'get_tensor_model_parallel_world_size'):
        # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
        return mpu.get_tensor_model_parallel_world_size()
    elif hasattr(mpu, 'get_slice_parallel_world_size'):
        # Some DeepSpeed + pipeline parallelism versions
        return mpu.get_slice_parallel_world_size()
    else:
        # Deprecated Megatron and DeepSpeed convention
        return mpu.get_model_parallel_world_size()


def bwc_tensor_model_parallel_group(mpu=None):
    """Backwards-compatible way of querying the tensor model parallel group.

    Similar to ``bwc_tensor_model_parallel_rank``.
    """
    if mpu is None:
        return None

    if hasattr(mpu, 'get_tensor_model_parallel_group'):
        # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
        return mpu.get_tensor_model_parallel_group()
    elif hasattr(mpu, 'get_slice_parallel_group'):
        # Some DeepSpeed + pipeline parallelism versions
        return mpu.get_slice_parallel_group()
    else:
        # Deprecated Megatron and DeepSpeed convention
        return mpu.get_model_parallel_group()
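

# A companion sketch for the three tensor-parallel helpers above, assuming a
# hypothetical mock that follows the preferred new convention, so the
# ``get_tensor_model_parallel_*`` branch is taken in each helper. The mock and
# the string standing in for the process group are illustrative only.
def _demo_bwc_tensor_model_parallel_queries():
    class _NewStyleMPU:

        def get_tensor_model_parallel_rank(self):
            return 1

        def get_tensor_model_parallel_world_size(self):
            return 8

        def get_tensor_model_parallel_group(self):
            # A real mpu returns a ``torch.distributed`` process group here.
            return 'tp-group'

    mpu = _NewStyleMPU()
    assert bwc_tensor_model_parallel_rank(mpu) == 1
    assert bwc_tensor_model_parallel_world_size(mpu) == 8
    assert bwc_tensor_model_parallel_group(mpu) == 'tp-group'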


def bwc_pipeline_parallel_world_size(mpu=None):
    """Backwards-compatible way of querying the pipeline parallel world size."""
    world_size = 1
    if mpu is not None:
        if hasattr(mpu, 'get_pipeline_model_parallel_world_size'):
            # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
            world_size = mpu.get_pipeline_model_parallel_world_size()
        elif hasattr(mpu, 'get_pipe_parallel_world_size'):
            # DeepSpeed Topology
            world_size = mpu.get_pipe_parallel_world_size()
    return world_size


def bwc_pipeline_parallel_group(mpu=None):
    """Backwards-compatible way of querying the pipeline parallel group."""
    if mpu is None:
        return None
    if hasattr(mpu, 'get_pipeline_model_parallel_group'):
        # Megatron
        return mpu.get_pipeline_model_parallel_group()
    elif hasattr(mpu, 'get_pipe_parallel_group'):
        # DeepSpeed Topology
        return mpu.get_pipe_parallel_group()
    # ``assert False`` is stripped under ``python -O``, so raise explicitly.
    raise RuntimeError('mpu does not support pipeline parallel group')
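

# Hedged demo entry point: exercises the helpers above with hypothetical
# mocks. ``_PipeTopologyMPU`` mimics the DeepSpeed Topology convention
# (``get_pipe_parallel_*`` accessors); in real use the ``mpu`` comes from
# Megatron-LM or a DeepSpeed pipeline topology, not from these stand-ins.
if __name__ == '__main__':

    class _PipeTopologyMPU:

        def get_pipe_parallel_world_size(self):
            return 4

        def get_pipe_parallel_group(self):
            # A real mpu returns a ``torch.distributed`` process group here.
            return 'pipe-group'

    _demo_bwc_tensor_model_parallel_rank()
    _demo_bwc_tensor_model_parallel_queries()

    # ``mpu=None`` means no pipeline parallelism: world size 1, no group.
    assert bwc_pipeline_parallel_world_size(None) == 1
    assert bwc_pipeline_parallel_group(None) is None

    mpu = _PipeTopologyMPU()
    assert bwc_pipeline_parallel_world_size(mpu) == 4
    assert bwc_pipeline_parallel_group(mpu) == 'pipe-group'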