config.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. # Copyright (c) Microsoft Corporation.
  2. # SPDX-License-Identifier: Apache-2.0
  3. # DeepSpeed Team
  4. import torch
  5. import deepspeed
  6. from pydantic import Field, field_validator
  7. from deepspeed.runtime.config_utils import DeepSpeedConfigModel
  8. from deepspeed.runtime.zero.config import DeepSpeedZeroConfig
  9. from typing import Dict, Union, Optional
  10. from enum import Enum
  11. class DtypeEnum(Enum):
  12. fp16 = (torch.float16, "torch.float16", "fp16", "float16", "half")
  13. fp32 = (torch.float32, "torch.float32", "fp32", "float32", "float")
  14. bf16 = (torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16", "bfloat")
  15. int8 = (torch.int8, "torch.int8", "int8")
  16. @classmethod
  17. def from_str(cls, value: str):
  18. for dtype in cls:
  19. if value in dtype.value:
  20. return dtype
  21. raise ValueError(f"'{value}' is not a valid DtypeEnum")
  22. class MoETypeEnum(str, Enum):
  23. residual = "residual"
  24. standard = "standard"
  25. class DeepSpeedTPConfig(DeepSpeedConfigModel):
  26. """ Configure tensor parallelism settings """
  27. enabled: bool = True
  28. """ Turn tensor parallelism on/off. """
  29. tp_size: int = 1
  30. """ Number of devices to split the model across using tensor parallelism. """
  31. mpu: object = None
  32. """
  33. A model parallelism unit object that implements
  34. ``get_{model,data}_parallel_{rank,group,world_size}()``.
  35. """
  36. tp_group: object = None
  37. class DeepSpeedMoEConfig(DeepSpeedConfigModel):
  38. """ Sets parameters for MoE """
  39. enabled: bool = True
  40. ep_size: int = 1
  41. """
  42. The expert-parallelism size which is used for partitioning the experts
  43. across the GPUs in the expert-parallel group.
  44. """
  45. moe_experts: list = Field([1], alias="num_experts")
  46. """ The global number of experts used in an MoE layer. """
  47. type: MoETypeEnum = MoETypeEnum.standard
  48. """
  49. Specify the type of MoE layer. We have two types of MoE layer: 'Standard'
  50. and 'Residual'.
  51. """
  52. ep_mp_group: object = None
  53. ep_group: object = Field(None, alias="expert_group")
  54. class QuantTypeEnum(str, Enum):
  55. asym = "asymmetric"
  56. sym = "symmetric"
  57. class BaseQuantConfig(DeepSpeedConfigModel):
  58. enabled: bool = True
  59. num_bits: int = 8
  60. q_type: QuantTypeEnum = QuantTypeEnum.sym
  61. q_groups: int = 1
  62. class WeightQuantConfig(BaseQuantConfig):
  63. enabled: bool = True
  64. quantized_initialization: Dict = {}
  65. post_init_quant: Dict = {}
  66. class ActivationQuantConfig(BaseQuantConfig):
  67. enabled: bool = True
  68. class QKVQuantConfig(DeepSpeedConfigModel):
  69. enabled: bool = True
  70. class QuantizationConfig(DeepSpeedConfigModel):
  71. enabled: bool = True
  72. activation: ActivationQuantConfig = ActivationQuantConfig()
  73. weight: WeightQuantConfig = WeightQuantConfig()
  74. qkv: QKVQuantConfig = QKVQuantConfig()
  75. # todo: brainstorm on how to do ckpt loading for DS inference
  76. class InferenceCheckpointConfig(DeepSpeedConfigModel):
  77. checkpoint_dir: Optional[str] = None
  78. save_mp_checkpoint_path: Optional[str] = None
  79. base_dir: Optional[str] = None
  80. class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
  81. """ Sets parameters for DeepSpeed Inference Engine. """
  82. replace_with_kernel_inject: bool = Field(False, alias="kernel_inject")
  83. """
  84. Set to true to inject inference kernels for models such as, Bert, GPT2,
  85. GPT-Neo and GPT-J. Otherwise, the injection_dict provides the names of two
  86. linear layers as a tuple:
  87. `(attention_output projection, transformer output projection)`
  88. """
  89. dtype: torch.dtype = torch.float16
  90. """
  91. Desired model data type, will convert model to this type.
  92. Supported target types: `torch.half`, `torch.int8`, `torch.float`
  93. """
  94. tensor_parallel: DeepSpeedTPConfig = Field({}, alias="tp")
  95. """
  96. Configuration for tensor parallelism used to split the model across several
  97. GPUs. Expects a dictionary containing values for :any:`DeepSpeedTPConfig`.
  98. """
  99. enable_cuda_graph: bool = False
  100. """
  101. Use this flag for capturing the CUDA-Graph of the inference ops, so that it
  102. can run faster using the graph replay method.
  103. """
  104. use_triton: bool = False
  105. """
  106. Use this flag to use triton kernels for inference ops.
  107. """
  108. triton_autotune: bool = False
  109. """
  110. Use this flag to enable triton autotuning.
  111. Turning it on is better for performance but increase the 1st runtime for
  112. autotuning.
  113. """
  114. zero: DeepSpeedZeroConfig = {}
  115. """
  116. ZeRO configuration to use with the Inference Engine. Expects a dictionary
  117. containing values for :any:`DeepSpeedZeroConfig`.
  118. """
  119. triangular_masking: bool = Field(True, alias="tm")
  120. """
  121. Controls the type of masking for attention scores in transformer layer.
  122. Note that the masking is application specific.
  123. """
  124. moe: Union[bool, DeepSpeedMoEConfig] = {}
  125. """
  126. Specify if the type of Transformer is MoE. Expects a dictionary containing
  127. values for :any:`DeepSpeedMoEConfig`.
  128. """
  129. quant: QuantizationConfig = {}
  130. """
  131. NOTE: only works for int8 dtype.
  132. Quantization settings used for quantizing your model using the MoQ. The
  133. setting can be one element or a tuple. If one value is passed in, we
  134. consider it as the number of groups used in quantization. A tuple is passed
  135. in if we want to mention that there is extra-grouping for the MLP part of a
  136. Transformer layer (e.g. (True, 8) shows we quantize the model using 8
  137. groups for all the network except the MLP part that we use 8 extra
  138. grouping). Expects a dictionary containing values for
  139. :any:`QuantizationConfig`.
  140. """
  141. #todo: refactor the following 3 into the new checkpoint_config
  142. checkpoint: Optional[Union[str, Dict]] = None
  143. """
  144. Path to deepspeed compatible checkpoint or path to JSON with load policy.
  145. """
  146. base_dir: str = ""
  147. """
  148. This shows the root directory under which all the checkpoint files exists.
  149. This can be passed through the json config too.
  150. """
  151. set_empty_params: bool = False
  152. """
  153. specifying whether the inference-module is created with empty or real Tensor
  154. """
  155. save_mp_checkpoint_path: Optional[str] = None
  156. """
  157. The path for which we want to save the loaded model with a checkpoint. This
  158. feature is used for adjusting the parallelism degree to help alleviate the
  159. model loading overhead. It does not save any new checkpoint if no path is
  160. passed.
  161. """
  162. checkpoint_config: InferenceCheckpointConfig = Field({}, alias="ckpt_config")
  163. """
  164. TODO: Add docs. Expects a dictionary containing values for
  165. :any:`InferenceCheckpointConfig`.
  166. """
  167. return_tuple: bool = True
  168. """
  169. Specify whether or not the transformer layers need to return a tuple or a
  170. Tensor.
  171. """
  172. training_mp_size: int = 1
  173. """
  174. If loading a checkpoint this is the mp size that it was trained with, it
  175. may be different than what the mp size that you want to use during
  176. inference.
  177. """
  178. replace_method: str = Field(
  179. "auto",
  180. json_schema_extra={
  181. "deprecated": True,
  182. "deprecated_msg": "This parameter is no longer needed, please remove from your call to DeepSpeed-inference"
  183. })
  184. injection_policy: Optional[Dict] = Field(None, alias="injection_dict")
  185. """
  186. Dictionary mapping a client nn.Module to its corresponding injection
  187. policy. e.g., `{BertLayer : deepspeed.inference.HFBertLayerPolicy}`
  188. """
  189. injection_policy_tuple: Optional[tuple] = None
  190. """ TODO: Add docs """
  191. config: Optional[Dict] = Field(None, alias="args") # todo: really no need for this field if we can refactor
  192. max_out_tokens: int = Field(1024, alias="max_tokens")
  193. """
  194. This argument shows the maximum number of tokens inference-engine can work
  195. with, including the input and output tokens. Please consider increasing it
  196. to the required token-length required for your use-case.
  197. """
  198. min_out_tokens: int = Field(1, alias="min_tokens")
  199. """
  200. This argument communicates to the runtime the minimum number of tokens you
  201. expect you will need to generate. This will cause the runtime to error
  202. if it unable to provide this and provide context on the memory pressure
  203. rather than seg-faulting or providing corrupted output.
  204. """
  205. transposed_mode: bool = Field(False, alias="transposed_mode")
  206. mp_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.tp_size"})
  207. """
  208. Desired model parallel size, default is 1 meaning no model parallelism.
  209. Deprecated, please use the ``tensor_parallel` config to control model
  210. parallelism.
  211. """
  212. mpu: object = Field(None, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.mpu"})
  213. ep_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "moe.ep_size"})
  214. ep_group: object = Field(None,
  215. alias="expert_group",
  216. json_schema_extra={
  217. "deprecated": True,
  218. "new_param": "moe.ep_group"
  219. })
  220. ep_mp_group: object = Field(None,
  221. alias="expert_mp_group",
  222. json_schema_extra={
  223. "deprecated": True,
  224. "new_param": "moe.ep_mp_group"
  225. })
  226. moe_experts: list = Field([1], json_schema_extra={"deprecated": True, "new_param": "moe.moe_experts"})
  227. moe_type: MoETypeEnum = Field(MoETypeEnum.standard,
  228. json_schema_extra={
  229. "deprecated": True,
  230. "new_param": "moe.type"
  231. })
  232. @field_validator("dtype", mode="before")
  233. def validate_dtype(cls, field_value, values):
  234. if isinstance(field_value, str):
  235. return DtypeEnum.from_str(field_value).value[0]
  236. if isinstance(field_value, torch.dtype):
  237. return field_value
  238. raise TypeError(f"Invalid type for dtype: {type(field_value)}")
  239. @field_validator("moe")
  240. def moe_backward_compat(cls, field_value, values):
  241. if isinstance(field_value, bool):
  242. return DeepSpeedMoEConfig(moe=field_value)
  243. return field_value
  244. @field_validator("use_triton")
  245. def has_triton(cls, field_value, values):
  246. if field_value and not deepspeed.HAS_TRITON:
  247. raise ValueError('Triton needs to be installed to use deepspeed with triton kernels')
  248. return field_value