config.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. '''Copyright The Microsoft DeepSpeed Team'''
  2. import torch
  3. from deepspeed.runtime.config_utils import DeepSpeedConfigModel
  4. from deepspeed.runtime.zero.config import DeepSpeedZeroConfig
  5. from pydantic import Field
  6. from pydantic import validator
  7. from typing import Dict, Union
  8. from enum import Enum
  9. class DtypeEnum(Enum):
  10. # The torch dtype must always be the first value (so we return torch.dtype)
  11. fp16 = torch.float16, "torch.float16", "fp16", "float16", "half"
  12. bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16"
  13. fp32 = torch.float32, "torch.float32", "fp32", "float32", "float"
  14. int8 = torch.int8, "torch.int8", "int8"
  15. # Copied from https://stackoverflow.com/a/43210118
  16. # Allows us to use multiple values for each Enum index and returns first
  17. # listed value when Enum is called
  18. def __new__(cls, *values):
  19. obj = object.__new__(cls)
  20. # first value is canonical value
  21. obj._value_ = values[0]
  22. for other_value in values[1:]:
  23. cls._value2member_map_[other_value] = obj
  24. obj._all_values = values
  25. return obj
  26. def __repr__(self):
  27. return "<%s.%s: %s>" % (
  28. self.__class__.__name__,
  29. self._name_,
  30. ", ".join([repr(v) for v in self._all_values]),
  31. )
  32. class MoETypeEnum(str, Enum):
  33. residual = "residual"
  34. standard = "standard"
  35. class DeepSpeedTPConfig(DeepSpeedConfigModel):
  36. """ Configure tensor parallelism settings """
  37. enabled: bool = True
  38. """ Turn tensor parallelism on/off. """
  39. tp_size: int = 1
  40. """ Number of devices to split the model across using tensor parallelism. """
  41. mpu: object = None
  42. """
  43. A model parallelism unit object that implements
  44. ``get_{model,data}_parallel_{rank,group,world_size}()``.
  45. """
  46. tp_group: object = None
  47. class DeepSpeedMoEConfig(DeepSpeedConfigModel):
  48. """ Sets parameters for MoE """
  49. enabled: bool = True
  50. ep_size: int = 1
  51. """
  52. The expert-parallelism size which is used for partitioning the experts
  53. across the GPUs in the expert-parallel group.
  54. """
  55. moe_experts: list = Field([1], alias="num_experts")
  56. """ The global number of experts used in an MoE layer. """
  57. type: MoETypeEnum = MoETypeEnum.standard
  58. """
  59. Specify the type of MoE layer. We have two types of MoE layer: 'Standard'
  60. and 'Residual'.
  61. """
  62. ep_mp_group: object = None
  63. ep_group: object = Field(None, alias="expert_group")
  64. class QuantTypeEnum(str, Enum):
  65. asym = "asymmetric"
  66. sym = "symmetric"
  67. class BaseQuantConfig(DeepSpeedConfigModel):
  68. enabled = True
  69. num_bits = 8
  70. q_type: QuantTypeEnum = QuantTypeEnum.sym
  71. q_groups: int = 1
  72. class WeightQuantConfig(BaseQuantConfig):
  73. enabled = True
  74. class ActivationQuantConfig(BaseQuantConfig):
  75. enabled = True
  76. class QKVQuantConfig(DeepSpeedConfigModel):
  77. enabled = True
  78. class QuantizationConfig(DeepSpeedConfigModel):
  79. enabled: bool = True
  80. activation: ActivationQuantConfig = ActivationQuantConfig()
  81. weight: WeightQuantConfig = WeightQuantConfig()
  82. qkv: QKVQuantConfig = QKVQuantConfig()
  83. # todo: brainstorm on how to do ckpt loading for DS inference
  84. class InferenceCheckpointConfig(DeepSpeedConfigModel):
  85. checkpoint_dir: str = None
  86. save_mp_checkpoint_path: str = None
  87. base_dir: str = None
  88. class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
  89. """ Sets parameters for DeepSpeed Inference Engine. """
  90. replace_with_kernel_inject: bool = Field(False, alias="kernel_inject")
  91. """
  92. Set to true to inject inference kernels for models such as, Bert, GPT2,
  93. GPT-Neo and GPT-J. Otherwise, the injection_dict provides the names of two
  94. linear layers as a tuple:
  95. `(attention_output projection, transformer output projection)`
  96. """
  97. dtype: DtypeEnum = torch.float16
  98. """
  99. Desired model data type, will convert model to this type.
  100. Supported target types: `torch.half`, `torch.int8`, `torch.float`
  101. """
  102. tensor_parallel: DeepSpeedTPConfig = Field({}, alias="tp")
  103. """
  104. Configuration for tensor parallelism used to split the model across several
  105. GPUs. Expects a dictionary containing values for :any:`DeepSpeedTPConfig`.
  106. """
  107. enable_cuda_graph: bool = False
  108. """
  109. Use this flag for capturing the CUDA-Graph of the inference ops, so that it
  110. can run faster using the graph replay method.
  111. """
  112. zero: DeepSpeedZeroConfig = {}
  113. """
  114. ZeRO configuration to use with the Inference Engine. Expects a dictionary
  115. containing values for :any:`DeepSpeedZeroConfig`.
  116. """
  117. triangular_masking: bool = Field(True, alias="tm")
  118. """
  119. Controls the type of masking for attention scores in transformer layer.
  120. Note that the masking is application specific.
  121. """
  122. moe: Union[bool, DeepSpeedMoEConfig] = {}
  123. """
  124. Specify if the type of Transformer is MoE. Expects a dictionary containing
  125. values for :any:`DeepSpeedMoEConfig`.
  126. """
  127. quant: QuantizationConfig = {}
  128. """
  129. NOTE: only works for int8 dtype.
  130. Quantization settings used for quantizing your model using the MoQ. The
  131. setting can be one element or a tuple. If one value is passed in, we
  132. consider it as the number of groups used in quantization. A tuple is passed
  133. in if we want to mention that there is extra-grouping for the MLP part of a
  134. Transformer layer (e.g. (True, 8) shows we quantize the model using 8
  135. groups for all the network except the MLP part that we use 8 extra
  136. grouping). Expects a dictionary containing values for
  137. :any:`QuantizationConfig`.
  138. """
  139. #todo: refactor the following 3 into the new checkpoint_config
  140. checkpoint: str = None
  141. """
  142. Path to deepspeed compatible checkpoint or path to JSON with load policy.
  143. """
  144. base_dir: str = None
  145. """
  146. This shows the root directory under which all the checkpoint files exists.
  147. This can be passed through the json config too.
  148. """
  149. save_mp_checkpoint_path: str = None
  150. """
  151. The path for which we want to save the loaded model with a checkpoint. This
  152. feature is used for adjusting the parallelism degree to help alleviate the
  153. model loading overhead. It does not save any new checkpoint if no path is
  154. passed.
  155. """
  156. checkpoint_config: InferenceCheckpointConfig = Field({}, alias="ckpt_config")
  157. """
  158. TODO: Add docs. Expects a dictionary containing values for
  159. :any:`InferenceCheckpointConfig`.
  160. """
  161. return_tuple: bool = True
  162. """
  163. Specify whether or not the transformer layers need to return a tuple or a
  164. Tensor.
  165. """
  166. training_mp_size: int = 1
  167. """
  168. If loading a checkpoint this is the mp size that it was trained with, it
  169. may be different than what the mp size that you want to use during
  170. inference.
  171. """
  172. replace_method: str = Field(
  173. "auto",
  174. deprecated=True,
  175. deprecated_msg=
  176. "This parameter is no longer needed, please remove from your call to DeepSpeed-inference"
  177. )
  178. injection_policy: Dict = Field(None, alias="injection_dict")
  179. """
  180. Dictionary mapping a client nn.Module to its corresponding injection
  181. policy. e.g., `{BertLayer : deepspeed.inference.HFBertLayerPolicy}`
  182. """
  183. injection_policy_tuple: tuple = None
  184. """ TODO: Add docs """
  185. config: Dict = Field(
  186. None,
  187. alias="args") # todo: really no need for this field if we can refactor
  188. max_out_tokens: int = Field(1024, alias="max_tokens")
  189. """
  190. This argument shows the maximum number of tokens inference-engine can work
  191. with, including the input and output tokens. Please consider increasing it
  192. to the required token-length required for your use-case.
  193. """
  194. mp_size: int = Field(1, deprecated=True, new_param="tensor_parallel.tp_size")
  195. """
  196. Desired model parallel size, default is 1 meaning no model parallelism.
  197. Deprecated, please use the ``tensor_parallel` config to control model
  198. parallelism.
  199. """
  200. mpu: object = Field(None, deprecated=True, new_param="tensor_parallel.mpu")
  201. ep_size: int = Field(1, deprecated=True, new_param="moe.ep_size")
  202. ep_group: object = Field(None,
  203. alias="expert_group",
  204. deprecated=True,
  205. new_param="moe.ep_group")
  206. ep_mp_group: object = Field(None,
  207. alias="expert_mp_group",
  208. deprecated=True,
  209. new_param="moe.ep_mp_group")
  210. moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts")
  211. moe_type: MoETypeEnum = Field(MoETypeEnum.standard,
  212. deprecated=True,
  213. new_param="moe.type")
  214. @validator("moe")
  215. def moe_backward_compat(cls, field_value, values):
  216. if isinstance(field_value, bool):
  217. return DeepSpeedMoEConfig(moe=field_value)
  218. return field_value
  219. class Config:
  220. # Get the str representation of the datatype for serialization
  221. json_encoders = {torch.dtype: lambda x: str(x)}