# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Create a container object to save model-specific tensors using the policy file above.
from .base import *
from deepspeed import comm as dist
import deepspeed.ops.transformer as transformer_inference
from deepspeed.accelerator import get_accelerator


class BaseTransformerMoEContainer(BaseTransformerContainer):

    def __init__(self, **kwargs):
        # Call the init function of the parent class to initialize the tensors and configs from parent class
        super().__init__(**kwargs)

        self.num_experts = self.policy.get_num_experts()
        self.ep_world_size = dist.get_world_size()
        self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size
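        # Example: with 128 experts and an expert-parallel world size of 8, each rank hosts
        # 128 // 8 = 16 local experts; if there are fewer experts than ranks, every rank keeps one.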

        self.layer_norm_eps = self.config.layer_norm_eps if hasattr(self.config, 'layer_norm_eps') else 1e-12

        # MoE models will have a list of mlp related tensors
        self._h4h_w = []
        self._h4h_b = []
        self._4hh_w = []
        self._4hh_b = []

        # Residual MoE needs extra parameters
        self._res_h4h_w = None
        self._res_h4h_b = None
        self._res_4hh_w = None
        self._res_4hh_b = None
        self._res_coef = None
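
    # Build the DeepSpeed MoE inference config from the policy's hidden size / head count
    # together with the MoE settings collected in __init__.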
    def create_ds_model_config(self):
        self.set_hidden_heads(*self.policy.get_hidden_heads())
        assert self.num_attention_heads % self.mp_size == 0, \
            "To run the model parallel across the GPUs, the number of attention heads must be divisible by the model-parallel world size. " \
            "This is because the attention computation is partitioned evenly among the parallel GPUs."

        self.ds_model_config = transformer_inference.DeepSpeedMoEInferenceConfig(
            hidden_size=self.hidden_size,
            heads=self.num_attention_heads,
            layer_norm_eps=self.layer_norm_eps,
            fp16=self.fp16,
            pre_layer_norm=self.pre_layer_norm,
            mp_size=self.mp_size,
            q_int8=self.quantize,
            moe_experts=self.local_ep_size,
            global_experts=self.num_experts,
            mlp_type=self.config.moe.type,
            scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx,
        )

        return self.ds_model_config

    def initialize_tensors(self):
        # Set the tensors from policy (user module) to container (DS module)
        self.set_attention(*self.policy.attention())
        self.set_mlp(self.config.moe.type)
        self.set_layernorm(*self.policy.layernorm())
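
    # The policy returns the per-expert MLP weights as lists; for the 'residual' MoE variant it also
    # returns the shared residual-MLP parameters and the coefficient that mixes the two branches.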
    def set_mlp(self, config_moe_type):
        if config_moe_type == 'standard':
            self._h4h_w, self._h4h_b, \
            self._4hh_w, self._4hh_b = self.policy.mlp()
        else:
            self._h4h_w, self._h4h_b, self._4hh_w, \
            self._4hh_b, self._res_h4h_w, self._res_h4h_b, \
            self._res_4hh_w, self._res_4hh_b, \
            self._res_coef = self.policy.mlp(config_moe_type)
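
    # The weights pulled from the client model are transposed into the layout the DeepSpeed
    # inference kernels expect.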
    def transpose(self):
        self.transpose_attention()
        self.transpose_mlp()

        if self.config.moe.type == 'residual':
            self.transpose_residual()

    def transpose_mlp(self):
        self._h4h_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._h4h_w]
        self._4hh_w = [self.transpose_impl(moe_w2.data) for moe_w2 in self._4hh_w]

    def transpose_residual(self):
        self._res_h4h_w.data = self.transpose_impl(self._res_h4h_w.data)
        self._res_4hh_w.data = self.transpose_impl(self._res_4hh_w.data)
        self._res_coef.data = self.transpose_impl(self._res_coef.data)

    def apply_tensor_parallelism(self, mp_replace):
        # setup the new Attention module
        self.attention_qkv_mp(mp_replace)
        self.attention_o_mp(mp_replace)

        # quantize attention weights
        self.attention_quantization()

        # setup the new MLP module
        self.mlp_mp()
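
    # Each rank keeps only its own shard of experts: the global index of local expert
    # `ep_index` on rank `gpu_index` is `gpu_index * local_ep_size + ep_index`.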
    def mlp_mp(self):
        gpu_index = dist.get_rank()
        for ep_index in range(self.local_ep_size):
            # mlp inter
            self.module.mlp[ep_index].inter_w.data = self._h4h_w[gpu_index * self.local_ep_size + ep_index].to(
                get_accelerator().current_device_name())
            self.module.mlp[ep_index].inter_b.data = self._h4h_b[gpu_index * self.local_ep_size + ep_index].to(
                get_accelerator().current_device_name())

            # mlp output
            self.module.mlp[ep_index].output_w.data = self._4hh_w[gpu_index * self.local_ep_size + ep_index].to(
                get_accelerator().current_device_name())
            self.module.mlp[ep_index].output_b.data = self._4hh_b[gpu_index * self.local_ep_size + ep_index].to(
                get_accelerator().current_device_name())

    def copy_data_to_new_module(self):
        self.module.attn_nw.data = self.attn_nw.to(get_accelerator().current_device_name())
        self.module.attn_nb.data = self.attn_nb.to(get_accelerator().current_device_name())

        self.module.norm_w.data.copy_(self.input_nw.to(get_accelerator().current_device_name()))
        self.module.norm_b.data.copy_(self.input_nb.to(get_accelerator().current_device_name()))

        if self.config.moe.type == 'residual':
            self.module.res_mlp.inter_w.data = self._res_h4h_w.to(get_accelerator().current_device_name())
            self.module.res_mlp.inter_b.data = self._res_h4h_b.to(get_accelerator().current_device_name())
            self.module.res_mlp.output_w.data = self._res_4hh_w.to(get_accelerator().current_device_name())
            self.module.res_mlp.output_b.data = self._res_4hh_b.to(get_accelerator().current_device_name())
            self.module.res_coef.data = self._res_coef.to(get_accelerator().current_device_name())
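
# Rough usage sketch (illustrative only; the real orchestration lives in DeepSpeed's module-injection
# code, and the names below stand in for whatever the concrete subclass and caller provide):
#
#   container = SomeMoEContainer(policy=policy, **injection_kwargs)
#   container.initialize_tensors()                  # pull attention/MLP/layernorm tensors via the policy
#   ds_cfg = container.create_ds_model_config()     # build the DeepSpeedMoEInferenceConfig
#   container.transpose()                           # match the inference-kernel weight layout
#   container.apply_tensor_parallelism(mp_replace)  # shard attention and copy this rank's experts
#   container.copy_data_to_new_module()             # move layer norms / residual-MoE params to device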