megatron_gpt_moe.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .base import *
from .base_moe import *
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
import torch
from .megatron_gpt import MegatronLayerPolicy
from packaging import version as pkg_version

class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer):

    def __init__(self, policy, config, model_config, layer_id):
        super().__init__(policy, config, model_config, layer_id)

        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention

        if self.megatron_v2:
            self.module.config.rotate_half = True
            self.module.config.rotate_every_two = False

        return self.module
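
# ---------------------------------------------------------------------------
# Illustrative note (not part of the original file): containers such as
# DS_MegatronGPTMoEContainer are not constructed by hand; they are driven by
# DeepSpeed's kernel-injection path. A minimal sketch, assuming `model` is a
# Megatron GPT MoE model instance and using keyword names from older
# deepspeed releases:
#
#   import deepspeed
#   import torch
#
#   engine = deepspeed.init_inference(model,
#                                     dtype=torch.half,
#                                     replace_with_kernel_inject=True)
#   logits = engine(input_ids)
#
# During injection, create_module() above builds the fused
# DeepSpeedMegatronGPTInference block that replaces the original layer.
# ---------------------------------------------------------------------------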

# TODO: Megatron GPT MoE inherits from the Megatron policy and replaces only the MLP
# TODO: overall goal is to generalize MoE support and expand it beyond Megatron
class MegatronMoELayerPolicy(MegatronLayerPolicy):
    _orig_layer_class = None
    version = 0
    moe_type = 'standard'
    num_experts = 1

    def __init__(self, client_module, inference=True):
        super().__init__(inference)
        self.client_module = client_module
        # we use the megatron version to differentiate between the old and new
        # megatron-lm source code
        if MegatronMoELayerPolicy._orig_layer_class is None:
            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
                MegatronMoELayerPolicy._orig_layer_class = None
            else:
                try:
                    from megatron.model.transformer import ParallelTransformerLayer
                    MegatronMoELayerPolicy._orig_layer_class = ParallelTransformerLayer
                except ImportError:
                    MegatronMoELayerPolicy._orig_layer_class = None

    def get_num_experts(self):
        return self.num_experts
    def mlp(self, moe_type='standard', enable_training=False):
        # for now, all of this is tightly coupled to the megatron-deepspeed moe implementation
        # todo: think and refactor this to be more general

        #from deepspeed.moe.utils import has_moe_layers
        #moe, _ = has_moe_layers(self.client_module)

        moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
            self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
        num_experts = len(moe_experts)
        self.num_experts = num_experts

        if moe_type == 'standard':
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
        else:
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
                   self.client_module.mlp.mlp.dense_h_to_4h.weight, \
                   self.client_module.mlp.mlp.dense_h_to_4h.bias, \
                   self.client_module.mlp.mlp.dense_4h_to_h.weight, \
                   self.client_module.mlp.mlp.dense_4h_to_h.bias, \
                   self.client_module.mlp.coefficient.weight
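

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): documents the layout of
# the tuple returned by MegatronMoELayerPolicy.mlp() above. For
# moe_type='standard' four per-expert lists are returned; otherwise the shared
# residual-MLP parameters and the coefficient weight used to combine expert
# and residual outputs are appended. The helper name below is hypothetical and
# is never called by DeepSpeed itself.
def _example_unpack_moe_params(policy, moe_type='standard'):
    if moe_type == 'standard':
        # Four lists, each holding one tensor per local expert.
        fc1_w, fc1_b, fc2_w, fc2_b = policy.mlp(moe_type)
    else:
        # The same four lists, plus the residual MLP weights/biases and the
        # mixing-coefficient weight as individual tensors.
        (fc1_w, fc1_b, fc2_w, fc2_b,
         res_fc1_w, res_fc1_b, res_fc2_w, res_fc2_b,
         coef_w) = policy.mlp(moe_type)
    return len(fc1_w)  # number of local experts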