layer.py

'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch.nn.init as init
import torch
import torch.distributed as dist
from deepspeed.utils import logger, log_dist
import deepspeed.utils.groups as groups
from .sharded_moe import MOELayer, TopKGate
from .experts import Experts
import copy
import typing


class MoE(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 expert,
                 num_experts=1,
                 k=1,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None,
                 drop_tokens: bool = True,
                 use_rts=True,
                 use_tutel: bool = False):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model; importantly, this is also the input and output dimension.
            expert (torch.nn.Module): the torch module that defines the expert (e.g., an MLP or torch.nn.Linear).
            num_experts (int, optional): default=1, the total number of experts per layer.
            k (int, optional): default=1, top-k gating value; only k=1 and k=2 are supported.
            capacity_factor (float, optional): default=1.0, the capacity of each expert at training time.
            eval_capacity_factor (float, optional): default=1.0, the capacity of each expert at eval time.
            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
            noisy_gate_policy (str, optional): default=None, noisy gate policy; valid options are 'Jitter', 'RSample' or 'None'.
            drop_tokens (bool, optional): default=True, whether to drop tokens (setting this to False is equivalent to infinite capacity).
            use_rts (bool, optional): default=True, whether to use Random Token Selection.
            use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
        """
        super(MoE, self).__init__()

        assert groups.is_initialized(), \
            'Please call deepspeed.utils.groups.initialize() before using MoE layers'
        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
            'Unsupported noisy_gate_policy: ' + noisy_gate_policy

        # Experts are sharded across the expert-parallel group: each rank hosts
        # num_experts // expert_parallel_world_size of them locally.
        num_local_experts = num_experts // groups.get_expert_parallel_world_size()

        log_dist(
            f'num_experts: {num_experts} | num_local_experts: {num_local_experts} | expert_parallel_size: {groups.get_expert_parallel_world_size()}',
            [0])

        self.num_experts = num_experts
        experts = Experts(expert, num_local_experts)
        # Compose the top-k gate and the local experts into a single sharded MoE
        # layer that exchanges tokens within the expert-parallel group.
        self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                               num_experts,
                                               k,
                                               capacity_factor,
                                               eval_capacity_factor,
                                               min_capacity,
                                               noisy_gate_policy,
                                               drop_tokens,
                                               use_rts),
                                      experts,
                                      num_local_experts,
                                      group=groups.get_expert_parallel_group(),
                                      use_tutel=use_tutel)
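
    # Illustrative construction sketch (assumes deepspeed.utils.groups.initialize()
    # has already been called): any module whose input and output dimension equal
    # `hidden_size` can serve as the expert, for example:
    #
    #   expert = torch.nn.Sequential(torch.nn.Linear(512, 2048),
    #                                torch.nn.ReLU(),
    #                                torch.nn.Linear(2048, 512))
    #   moe = MoE(hidden_size=512, expert=expert, num_experts=8, k=1)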

    def forward(self, hidden_states, used_token=None):
        """ MoE forward

        Arguments:
            hidden_states (Tensor): input to the layer
            used_token (Tensor, optional): default: None, mask only used tokens

        Returns:
            A tuple including output, gate loss, and expert count.

            * output (Tensor): output of the model
            * l_aux (Tensor): gate loss value
            * exp_counts (int): expert count
        """
        output = self.deepspeed_moe(hidden_states, used_token)
        return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts
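

if __name__ == "__main__":
    # Minimal single-process usage sketch. Because of the relative imports above,
    # run it as `python -m deepspeed.moe.layer` rather than directly. It assumes
    # torch.distributed can be initialized in-process and that groups.initialize()
    # accepts an `ep_size` argument (as in the DeepSpeed MoE tutorial); adjust for
    # the installed DeepSpeed version.
    import os
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group(backend='gloo', rank=0, world_size=1)
    groups.initialize(ep_size=1)

    hidden = 512
    # A simple feed-forward expert whose input/output dimension matches hidden_size.
    expert = torch.nn.Sequential(torch.nn.Linear(hidden, 4 * hidden),
                                 torch.nn.ReLU(),
                                 torch.nn.Linear(4 * hidden, hidden))
    moe = MoE(hidden_size=hidden, expert=expert, num_experts=2, k=1)

    # forward() preserves the input shape and also returns the auxiliary gate loss
    # and the per-expert token counts.
    tokens = torch.randn(8, 16, hidden)  # (batch, sequence, hidden_size)
    output, l_aux, exp_counts = moe(tokens)
    print(output.shape, float(l_aux), exp_counts)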