# megatron_model.py

import torch
import os
import sys
import math
from .common import get_test_path
from deepspeed.pipe import PipelineModule, LayerSpec


def get_megatron_version():
    # Query pip for the installed megatron-lm package and return its major version.
    p = os.popen("pip list --format=columns | grep megatron-lm")
    pip_list = p.read()
    assert 'megatron-lm' in pip_list, 'Please install Megatron-LM before getting its version'
    ver_str = pip_list.split()[1]
    return float(ver_str[0])


def get_gpt2_model(args_others, mp_size=1):
    from megatron.model import GPT2Model
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }
    args_defaults.update(args_others)

    # Set --make-vocab-size-divisible-by to 1 so the word-embedding size does not
    # change during the resizing tests.
    sys.argv.extend([
        '--model-parallel-size',
        str(mp_size),
        '--make-vocab-size-divisible-by',
        str(1),
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)
    model = GPT2Model(num_tokentypes=0, parallel_output=False)
    model.cuda()

    # Wrap the model in torch DDP over the data-parallel process group.
    from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
    from megatron import mpu
    i = torch.cuda.current_device()
    model = torchDDP(model,
                     device_ids=[i],
                     output_device=i,
                     process_group=mpu.get_data_parallel_group())
    return model
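

# Illustrative usage sketch (an assumption, not part of the original test code): how
# get_gpt2_model might be driven from a test. The argument names mirror standard
# Megatron-LM options passed through args_others; the values are placeholders.
def _example_build_gpt2_model():
    args_others = {
        'num_layers': 2,
        'hidden_size': 128,
        'num_attention_heads': 8,
        'max_position_embeddings': 128,
    }
    return get_gpt2_model(args_others, mp_size=1)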


class MockGPT2ModelPipe(PipelineModule):
    def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
        from megatron.initialize import initialize_megatron

        args_defaults = {
            'vocab_file': get_test_path('gpt2-vocab.json'),
            'merge_file': get_test_path('gpt2-merges.txt'),
            'tokenizer_type': 'GPT2BPETokenizer',
        }
        args_defaults.update(args_others)

        # Set --make-vocab-size-divisible-by to 1 so the word-embedding size does not
        # change during the resizing tests.
        sys.argv.extend([
            '--model-parallel-size',
            str(mp_size),
            '--make-vocab-size-divisible-by',
            str(1),
        ])

        initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

        from megatron.model.transformer import ParallelTransformerLayer

        class ParallelTransformerLayerPipe(ParallelTransformerLayer):
            def forward(self, args):
                # Hardcode the attention mask for testing; pipeline parallelism
                # requires the attn_mask to be stashed.
                attention_mask = torch.tensor([[True]],
                                              device=torch.cuda.current_device())
                return super().forward(args, attention_mask)

        layers = []
        for x in range(num_layers):
            layers.append(
                LayerSpec(ParallelTransformerLayerPipe,
                          self.gpt2_attention_mask_func,
                          self.init_method_normal(0.02),
                          self.scaled_init_method_normal(0.02, num_layers),
                          x))

        super().__init__(layers=layers,
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         topology=topo,
                         **kwargs)

    def gpt2_attention_mask_func(self, attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    def init_method_normal(self, sigma):
        """Init method based on N(0, sigma)."""
        def init_(tensor):
            return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

        return init_

    def scaled_init_method_normal(self, sigma, num_layers):
        """Init method based on N(0, sigma/sqrt(2*num_layers))."""
        std = sigma / math.sqrt(2.0 * num_layers)

        def init_(tensor):
            return torch.nn.init.normal_(tensor, mean=0.0, std=std)

        return init_
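

# Illustrative usage sketch (an assumption, not part of the original test code): builds
# the mock pipeline model on a small DeepSpeed process topology. Requires a CUDA-enabled,
# initialized torch.distributed environment such as the DeepSpeed unit-test launcher sets up;
# the layer counts, parallel sizes, and args_others values below are placeholders.
def _example_build_mock_pipe_model():
    from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

    # Two pipeline stages, no model parallelism, one data-parallel replica (assumed sizes).
    topo = PipeModelDataParallelTopology(num_pp=2, num_mp=1, num_dp=1)
    return MockGPT2ModelPipe(num_layers=2,
                             mp_size=1,
                             args_others={'num_layers': 2, 'hidden_size': 128},
                             topo=topo)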