megatron_model.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import torch
import os
import sys
import math

from .common import get_test_path
from deepspeed.pipe import PipelineModule, LayerSpec
from deepspeed.accelerator import get_accelerator


def get_megatron_version():
    # Parse `pip list` output; note this returns only the major version as a float
    # (e.g. "2.4.0" -> 2.0), assuming a single-digit major version.
    p = os.popen("pip list --format=columns | grep megatron-lm")
    pip_list = p.read()
    assert 'megatron-lm' in pip_list, 'Please install Megatron-LM before getting its version'
    ver_str = pip_list.split()[1]
    return float(ver_str[0])
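

# Illustrative sketch, not part of the original helpers: one way a test could use
# get_megatron_version() to gate on the installed Megatron-LM major version. The
# pytest-based skip and the version threshold are assumptions for the example.
def _example_require_megatron_major(required_major=2.0):
    import pytest
    if get_megatron_version() < required_major:
        pytest.skip(f"example gate: this sketch assumes Megatron-LM major version >= {required_major}")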


def get_gpt2_model(args_others, mp_size=1):
    from megatron.model import GPT2Model
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }

    args_defaults.update(args_others)

    # Set "make-vocab-size-divisible-by" to 1 to avoid a word-embedding size change in the resizing test.
    sys.argv.extend(['--model-parallel-size', str(mp_size), '--make-vocab-size-divisible-by', str(1)])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)
    model = GPT2Model(num_tokentypes=0, parallel_output=False)
    model.to(get_accelerator().device_name())

    # Wrap the model in torch DDP over Megatron's data-parallel process group.
    from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
    from megatron import mpu
    i = get_accelerator().current_device_name()
    model = torchDDP(model, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group())

    return model
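

# Illustrative sketch, not part of the original helpers: one way a test might build the
# GPT-2 model and hand it to DeepSpeed. The Megatron argument values and the DeepSpeed
# config below are assumptions for the example; the exact arguments required depend on
# the installed Megatron-LM version, and distributed/accelerator setup must already be done.
def _example_wrap_gpt2_with_deepspeed():
    import deepspeed
    args_others = {
        'num_layers': 2,
        'hidden_size': 64,
        'num_attention_heads': 2,
        'max_position_embeddings': 128,
        'seq_length': 128,
    }
    model = get_gpt2_model(args_others, mp_size=1)
    # Minimal config with no optimizer section; DeepSpeed builds an engine without an optimizer.
    engine, _, _, _ = deepspeed.initialize(model=model, config={"train_batch_size": 8})
    return engine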


class MockGPT2ModelPipe(PipelineModule):

    def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
        from megatron.initialize import initialize_megatron

        args_defaults = {
            'vocab_file': get_test_path('gpt2-vocab.json'),
            'merge_file': get_test_path('gpt2-merges.txt'),
            'tokenizer_type': 'GPT2BPETokenizer',
        }

        args_defaults.update(args_others)

        # Set "make-vocab-size-divisible-by" to 1 to avoid a word-embedding size change in the resizing test.
        sys.argv.extend(['--model-parallel-size', str(mp_size), '--make-vocab-size-divisible-by', str(1)])

        initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

        from megatron.model.transformer import ParallelTransformerLayer

        class ParallelTransformerLayerPipe(ParallelTransformerLayer):

            def forward(self, args):
                # Hardcode the attention mask for testing; pipeline parallelism requires the attn_mask to be stashed.
                attention_mask = torch.tensor([[True]], device=get_accelerator().current_device_name())
                return super().forward(args, attention_mask)

        layers = []
        for x in range(num_layers):
            layers.append(
                LayerSpec(ParallelTransformerLayerPipe, self.gpt2_attention_mask_func, self.init_method_normal(0.02),
                          self.scaled_init_method_normal(0.02, num_layers), x))

        super().__init__(layers=layers, loss_fn=torch.nn.CrossEntropyLoss(), topology=topo, **kwargs)

    def gpt2_attention_mask_func(self, attention_scores, ltor_mask):
        # Mask out positions marked True in the left-to-right mask with a large negative value.
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    def init_method_normal(self, sigma):
        """Init method based on N(0, sigma)."""

        def init_(tensor):
            return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

        return init_

    def scaled_init_method_normal(self, sigma, num_layers):
        """Init method based on N(0, sigma/sqrt(2*num_layers))."""
        std = sigma / math.sqrt(2.0 * num_layers)

        def init_(tensor):
            return torch.nn.init.normal_(tensor, mean=0.0, std=std)

        return init_
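

# Illustrative sketch, not part of the original helpers: constructing the mock pipeline
# model with a DeepSpeed pipeline/model/data-parallel topology. The parallel sizes and
# Megatron arguments below are assumptions for the example; they must match the launched
# world size, and torch.distributed must already be initialized.
def _example_build_mock_pipeline_model():
    from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology
    topo = PipeModelDataParallelTopology(num_pp=2, num_mp=1, num_dp=1)
    args_others = {
        'num_layers': 4,
        'hidden_size': 64,
        'num_attention_heads': 2,
        'max_position_embeddings': 128,
        'seq_length': 128,
    }
    return MockGPT2ModelPipe(num_layers=4, mp_size=1, args_others=args_others, topo=topo)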