config.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import json
import torch
from deepspeed.utils.types import ActivationFuncType, NormType


class TransformerConfig():

    def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers):
        self.layer_id = -1
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.heads = heads
        self.num_hidden_layers = num_hidden_layers

class DeepSpeedInferenceConfig(TransformerConfig):
    """Initialize the DeepSpeed Transformer Config.

    Arguments:
        hidden_size: The hidden size of the transformer layer.
        intermediate_size: The intermediate size of the feed-forward part of the transformer layer.
        heads: The number of heads in the self-attention of the transformer layer.
        num_hidden_layers: The number of transformer layers.
        layer_norm_eps: The epsilon value for the layer norm.
        local_rank: Optional: The rank of the GPU running the transformer kernel. It does not need
            to be set if the model has already set the current device; otherwise, set it so that
            the transformer kernel can work on the right device.
        mp_size (optional): This argument is mainly used to create the parameters on the kernel side
            using a model-parallel architecture. If the client model already takes care of this, there
            is no need to pass this argument.
        pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture.
        stochastic_mode: Enable for high performance. Note that this flag introduces some level of
            non-determinism and can produce different results on different runs. However, we have seen
            that enabling it does not affect pretraining tasks such as BERT, which can still reach a
            high accuracy level. For downstream tasks such as fine-tuning, we recommend turning it off
            in order to reproduce the same result through regular kernel execution.
        scale_attention: If True, both q and k are scaled by 1/sqrt(attention_heads) before the
            attention computation.
        return_tuple: If True, returns the transformer output as a tuple; otherwise returns it as a tensor.
        bigscience_bloom: This flag is added temporarily to support the BLOOM-176B model architecture.
        use_triton: Whether to enable Triton kernels for inference.
    """
    def __init__(self,
                 hidden_size=-1,
                 intermediate_size=-1,
                 heads=-1,
                 num_hidden_layers=-1,
                 layer_norm_eps=1e-12,
                 local_rank=-1,
                 mp_size=1,
                 dtype=torch.float16,
                 pre_layer_norm=True,
                 norm_type=NormType.LayerNorm,
                 stochastic_mode=False,
                 scale_attention=True,
                 triangular_masking=True,
                 local_attention=False,
                 window_size=256,
                 rotary_dim=-1,
                 rotate_half=False,
                 rotate_every_two=True,
                 return_tuple=True,
                 mlp_after_attn=True,
                 mlp_act_func_type=ActivationFuncType.GELU,
                 training_mp_size=1,
                 bigscience_bloom=False,
                 max_out_tokens=1024,
                 min_out_tokens=1,
                 enable_qkv_quantization=False,
                 use_mup=False,
                 scale_attn_by_inverse_layer_idx=False,
                 return_single_tuple=False,
                 set_empty_params=False,
                 transposed_mode=False,
                 use_triton=False,
                 triton_autotune=False):
        super(DeepSpeedInferenceConfig,
              self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads,
                             num_hidden_layers)
        self.dtype = dtype
        self.pre_layer_norm = pre_layer_norm
        self.norm_type = norm_type
        self.local_rank = local_rank
        self.stochastic_mode = stochastic_mode
        self.epsilon = layer_norm_eps
        self.mp_size = mp_size
        self.scale_attention = scale_attention
        self.triangular_masking = triangular_masking
        self.local_attention = local_attention
        self.window_size = window_size
        self.rotary_dim = rotary_dim
        self.rotate_half = rotate_half
        self.rotate_every_two = rotate_every_two
        self.return_tuple = return_tuple
        self.mlp_after_attn = mlp_after_attn
        self.mlp_act_func_type = mlp_act_func_type
        self.specialized_mode = False
        self.training_mp_size = training_mp_size
        self.bigscience_bloom = bigscience_bloom
        self.max_out_tokens = max_out_tokens
        self.min_out_tokens = min_out_tokens
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.enable_qkv_quantization = enable_qkv_quantization
        self.use_mup = use_mup
        self.return_single_tuple = return_single_tuple
        self.set_empty_params = set_empty_params
        self.transposed_mode = transposed_mode
        self.use_triton = use_triton
        self.triton_autotune = triton_autotune
    @classmethod
    def from_dict(cls, json_object):
        # Start from the default config and copy every key/value pair from the
        # dict directly into the instance, overriding the defaults set in __init__.
        config = DeepSpeedInferenceConfig()
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        # Read a JSON file of config keys and build the config via from_dict.
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))
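
Below is a minimal usage sketch (not part of the original config.py) showing the two ways the config can be built: directly with keyword arguments, and via from_dict / from_json_file. The concrete values and the "inference_config.json" path are illustrative assumptions only.

if __name__ == "__main__":
    # Direct construction with explicit kwargs; unspecified arguments keep
    # the defaults defined in __init__.
    config = DeepSpeedInferenceConfig(hidden_size=1024,
                                      intermediate_size=4096,
                                      heads=16,
                                      num_hidden_layers=24,
                                      dtype=torch.float16,
                                      max_out_tokens=2048)

    # Equivalent construction from a plain dict. Note that from_dict copies
    # keys into __dict__ after __init__ has already run with the defaults, so
    # derived values such as intermediate_size must be supplied explicitly.
    config_from_dict = DeepSpeedInferenceConfig.from_dict({
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "heads": 16,
        "num_hidden_layers": 24,
        "max_out_tokens": 2048,
    })

    # Loading from a JSON file with the same keys (hypothetical path):
    # config_from_file = DeepSpeedInferenceConfig.from_json_file("inference_config.json")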