megatron_gpt_moe.py

'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .base_moe import *
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
import torch
from .megatron_gpt import MegatronLayerPolicy
from packaging import version as pkg_version

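
# Inference container for Megatron GPT MoE transformer layers: it combines the
# Megatron-specific feature handling from MegatronContainer with the MoE container
# base class (BaseTransformerMoEContainer).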
class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer):

    def __init__(self, policy, config, model_config, layer_id):
        super().__init__(policy, config, model_config, layer_id)

        # All model-specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        # Build the DeepSpeed inference module for this layer, preferring an explicitly
        # passed config over the container's own ds_model_config.
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention

        if self.megatron_v2:
            # Megatron v2 layers use the rotate-half layout instead of rotating
            # every two elements.
            self.module.config.rotate_half = True
            self.module.config.rotate_every_two = False

        return self.module


# TODO: Megatron GPT MoE inherits from the Megatron policy and only replaces the mlp hooks.
# TODO: Generalize the MoE handling; the overall goal is to expand beyond Megatron.
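# Layer policy for Megatron GPT MoE layers: maps an original megatron-lm
# ParallelTransformerLayer (the client module) to the tensors DeepSpeed inference
# needs. Attention and layernorm handling is inherited from MegatronLayerPolicy;
# only mlp() is overridden to return the per-expert MLP parameters.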
class MegatronMoELayerPolicy(MegatronLayerPolicy):
    _orig_layer_class = None
    version = 0
    moe_type = 'standard'
    num_experts = 1

    def __init__(self, client_module, inference=True):
        super().__init__(inference)
        self.client_module = client_module
        # The installed torch version is used as a proxy to differentiate between
        # the old and new megatron-lm source code.
        if MegatronMoELayerPolicy._orig_layer_class is None:
            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
                MegatronMoELayerPolicy._orig_layer_class = None
            else:
                try:
                    from megatron.model.transformer import ParallelTransformerLayer
                    MegatronMoELayerPolicy._orig_layer_class = ParallelTransformerLayer
                except ImportError:
                    MegatronMoELayerPolicy._orig_layer_class = None

    def get_num_experts(self):
        return self.num_experts
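
    # mlp() returns, for the 'standard' MoE type, four lists with one entry per expert:
    # dense_h_to_4h weights and biases, and dense_4h_to_h weights and biases. For any
    # other moe_type it additionally returns the shared (non-expert) MLP parameters and
    # the coefficient weight used to combine the expert and shared MLP outputs in
    # Megatron-DeepSpeed's residual-style MoE.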
    def mlp(self, moe_type='standard'):
        # For now, all of this is tightly coupled to the Megatron-DeepSpeed MoE implementation.
        # TODO: think about and refactor this to be more general.

        # from deepspeed.moe.utils import has_moe_layers
        # moe, _ = has_moe_layers(self.client_module)

        moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
                      self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
        num_experts = len(moe_experts)
        self.num_experts = num_experts

        if moe_type == 'standard':
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
        else:
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
                   self.client_module.mlp.mlp.dense_h_to_4h.weight, \
                   self.client_module.mlp.mlp.dense_h_to_4h.bias, \
                   self.client_module.mlp.mlp.dense_4h_to_h.weight, \
                   self.client_module.mlp.mlp.dense_4h_to_h.bias, \
                   self.client_module.mlp.coefficient.weight
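
# Usage sketch (not part of this module): in DeepSpeed inference this policy is
# normally selected automatically when the original megatron-lm ParallelTransformerLayer
# is detected. A hypothetical manual invocation for a standard-MoE layer would look like:
#
#     policy = MegatronMoELayerPolicy(client_module=layer)  # `layer` is assumed to be a
#                                                           # megatron-lm ParallelTransformerLayer
#     w_h_to_4h, b_h_to_4h, w_4h_to_h, b_4h_to_h = policy.mlp(moe_type='standard')
#     num_experts = policy.get_num_experts()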