constants.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. # Copyright (c) Microsoft Corporation.
  2. # SPDX-License-Identifier: Apache-2.0
  3. # DeepSpeed Team
  4. #############################################
  5. # Routes
  6. #############################################
  7. ROUTE_TRAIN = "train"
  8. ROUTE_EVAL = "eval"
  9. ROUTE_PREDICT = "predict"
  10. ROUTE_ENCODE = "encode"
  11. #############################################
  12. # Batch size
  13. #############################################
  14. TRAIN_BATCH_SIZE = "train_batch_size"
  15. TRAIN_BATCH_SIZE_DEFAULT = None
  16. #############################################
  17. # Sparse attention
  18. #############################################
  19. SPARSE_ATTENTION = "sparse_attention"
  20. SPARSE_DENSE_MODE = "dense"
  21. SPARSE_FIXED_MODE = "fixed"
  22. SPARSE_VARIABLE_MODE = "variable"
  23. SPARSE_BIGBIRD_MODE = "bigbird"
  24. SPARSE_BSLONGFORMER_MODE = "bslongformer"
  25. SPARSE_MODE = "mode"
  26. SPARSE_MODE_DEFAULT = SPARSE_FIXED_MODE
  27. SPARSE_BLOCK = "block"
  28. SPARSE_BLOCK_DEFAULT = 16
  29. SPARSE_DIFFERENT_LAYOUT_PER_HEAD = "different_layout_per_head"
  30. SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT = False
  31. SPARSE_NUM_LOCAL_BLOCKS = "num_local_blocks"
  32. SPARSE_NUM_LOCAL_BLOCKS_DEFAULT = 4
  33. SPARSE_NUM_GLOBAL_BLOCKS = "num_global_blocks"
  34. SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT = 1
  35. SPARSE_ATTENTION_TYPE = "attention"
  36. SPARSE_ATTENTION_TYPE_DEFAULT = "bidirectional"
  37. SPARSE_HORIZONTAL_GLOBAL_ATTENTION = "horizontal_global_attention"
  38. SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT = False
  39. SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS = "num_different_global_patterns"
  40. SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT = 1
  41. SPARSE_NUM_RANDOM_BLOCKS = "num_random_blocks"
  42. SPARSE_NUM_RANDOM_BLOCKS_DEFAULT = 0
  43. SPARSE_LOCAL_WINDOW_BLOCKS = "local_window_blocks"
  44. SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT = [4]
  45. SPARSE_GLOBAL_BLOCK_INDICES = "global_block_indices"
  46. SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT = [0]
  47. SPARSE_GLOBAL_BLOCK_END_INDICES = "global_block_end_indices"
  48. SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT = None
  49. SPARSE_NUM_SLIDING_WINDOW_BLOCKS = "num_sliding_window_blocks"
  50. SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT = 3
  51. #############################################
  52. # Optimizer and lr scheduler
  53. #############################################
  54. OPTIMIZER = "optimizer"
  55. OPTIMIZER_TYPE_DEFAULT = None
  56. OPTIMIZER_PARAMS = "params"
  57. TYPE = "type"
  58. LEGACY_FUSION = "legacy_fusion"
  59. LEGACY_FUSION_DEFAULT = False
  60. SCHEDULER = "scheduler"
  61. SCHEDULER_TYPE_DEFAULT = None
  62. SCHEDULER_PARAMS = "params"
  63. MAX_GRAD_NORM = 'max_grad_norm'
  64. #############################################
  65. # Optimizer and lr scheduler
  66. #############################################
  67. ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer"
  68. ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False
  69. ZERO_FORCE_DS_CPU_OPTIMIZER = "zero_force_ds_cpu_optimizer"
  70. ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULT = True
  71. # Steps
  72. STEPS_PER_PRINT = "steps_per_print"
  73. STEPS_PER_PRINT_DEFAULT = 10
  74. #########################################
  75. # Training micro batch size per GPU
  76. #########################################
  77. # Batch size for one training step. This is used when the
  78. # TRAIN_BATCH_SIZE cannot fit in GPU memory to determine
  79. # the number of gradient accumulation steps. By default, this
  80. # is set to None. Users can configure in ds_config.json as below example:
  81. TRAIN_MICRO_BATCH_SIZE_PER_GPU = '''
  82. TRAIN_MICRO_BATCH_SIZE_PER_GPU is defined in this format:
  83. "train_micro_batch_size_per_gpu": 1
  84. '''
  85. TRAIN_MICRO_BATCH_SIZE_PER_GPU = "train_micro_batch_size_per_gpu"
  86. TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = None
  87. #########################################
  88. # Gradient Accumulation
  89. #########################################
  90. # Gradient accumulation feature. By default, this feature is not enabled.
  91. # Users can configure in ds_config.json as below example:
  92. GRADIENT_ACCUMULATION_FORMAT = '''
  93. Gradient Accumulation should be of the format:
  94. "gradient_accumulation_steps": 1
  95. '''
  96. GRADIENT_ACCUMULATION_STEPS = "gradient_accumulation_steps"
  97. GRADIENT_ACCUMULATION_STEPS_DEFAULT = None
  98. # DeepSpeed CSR gradient sparsity
  99. SPARSE_GRADIENTS = "sparse_gradients"
  100. SPARSE_GRADIENTS_DEFAULT = False
  101. #########################################
  102. # BFLOAT16 support
  103. #########################################
  104. # BFLOAT16 feature. By default, this feature is not enabled.
  105. # Users can configure in ds_config.json as below example:
  106. BFLOAT16_FORMAT = '''
  107. BFLOAT16 parameters should be of the format:
  108. "bf16": {
  109. "enabled": true
  110. }
  111. '''
  112. BFLOAT16 = "bf16"
  113. BFLOAT16_OLD = "bfloat16" # keeping for backwards compatibility
  114. BFLOAT16_ENABLED = "enabled"
  115. BFLOAT16_ENABLED_DEFAULT = False
  116. # BFLOAT16 optimizer immediate gradient update
  117. BFLOAT16_IMMEDIATE_GRAD_UPDATE = "immediate_grad_update"
  118. BFLOAT16_IMMEDIATE_GRAD_UPDATE_DEFAULT = False
  119. #########################################
  120. # FP16 support
  121. #########################################
  122. # FP16 feature. By default, this feature is not enabled.
  123. # Users can configure in ds_config.json as below example:
  124. FP16_FORMAT = '''
  125. FP16 parameters should be of the format:
  126. "fp16": {
  127. "enabled": true,
  128. "auto_cast": false,
  129. "loss_scale": 0,
  130. "initial_scale_power": 16,
  131. "loss_scale_window": 1000,
  132. "hysteresis": 2,
  133. "consecutive_hysteresis": false,
  134. "min_loss_scale": 1
  135. }
  136. '''
  137. FP16 = "fp16"
  138. FP16_ENABLED = "enabled"
  139. FP16_ENABLED_DEFAULT = False
  140. # FP16 loss scale, zero means using dynamic scaling
  141. FP16_LOSS_SCALE = "loss_scale"
  142. FP16_LOSS_SCALE_DEFAULT = 0
  143. FP16_AUTO_CAST = "auto_cast"
  144. FP16_AUTO_CAST_DEFAULT = False
  145. # FP16 initial dynamic scale loss power
  146. FP16_INITIAL_SCALE_POWER = "initial_scale_power"
  147. FP16_INITIAL_SCALE_POWER_DEFAULT = 16
  148. # FP16 loss scale window
  149. FP16_LOSS_SCALE_WINDOW = "loss_scale_window"
  150. FP16_LOSS_SCALE_WINDOW_DEFAULT = 1000
  151. # FP16 hysteresis
  152. FP16_HYSTERESIS = "hysteresis"
  153. FP16_HYSTERESIS_DEFAULT = 2
  154. # FP16 consecutive hysteresis
  155. FP16_CONSECUTIVE_HYSTERESIS = "consecutive_hysteresis"
  156. FP16_CONSECUTIVE_HYSTERESIS_DEFAULT = False
  157. # FP16 min loss scale
  158. FP16_MIN_LOSS_SCALE = "min_loss_scale"
  159. FP16_MIN_LOSS_SCALE_DEFAULT = 1
  160. # FP16 master and grads
  161. FP16_MASTER_WEIGHTS_AND_GRADS = "fp16_master_weights_and_grads"
  162. FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT = False
  163. #########################################
  164. # Apex AMP support
  165. #########################################
  166. # Use Apex AMP for mixed precision support, all parameters (other than 'enabled') will be passed to
  167. # amp.initialize(model, optimizer, **amp_params)
  168. # See apex documentation for supported parameters/features: https://nvidia.github.io/apex/amp.html#apex.amp.initialize
  169. AMP_FORMAT = '''
  170. "amp" {
  171. "enabled: true,
  172. "opt_level": "O1",
  173. ...
  174. }
  175. '''
  176. AMP = "amp"
  177. AMP_ENABLED = "enabled"
  178. AMP_ENABLED_DEFAULT = False
  179. #########################################
  180. # Gradient clipping
  181. #########################################
  182. # Gradient clipping. By default, this feature is not enabled.
  183. # Users can configure in ds_config.json as below example:
  184. GRADIENT_CLIPPING_FORMAT = '''
  185. Gradient clipping should be enabled as:
  186. "gradient_clipping": 1.0
  187. '''
  188. GRADIENT_CLIPPING = 'gradient_clipping'
  189. GRADIENT_CLIPPING_DEFAULT = 0.
  190. #########################################
  191. # Capture graph for short kernels sequences
  192. #########################################
  193. # Graph harvesting. By default, this feature is not enabled.
  194. # Users can configure in ds_config.json as below example:
  195. GRAPH_HARVESTING_FORMAT = '''
  196. Graph harvesting should be enabled as:
  197. "graph_harvesting": true
  198. '''
  199. GRAPH_HARVESTING = 'graph_harvesting'
  200. GRAPH_HARVESTING_DEFAULT = False
  201. #########################################
  202. # Communication data type
  203. #########################################
  204. # Supported types: ['none', 'fp16', 'fp32']
  205. # By default, this feature is not enabled ('none' value)
  206. # Users can configure in ds_config.json as below example:
  207. COMMUNICATION_DATA_TYPE_FORMAT = '''
  208. Communication data type should be set as:
  209. "communication_data_type": "fp32"
  210. '''
  211. COMMUNICATION_DATA_TYPE = "communication_data_type"
  212. COMMUNICATION_DATA_TYPE_DEFAULT = None
  213. ###########################################################
  214. # Gradient communication data type for sequence parallelism
  215. ###########################################################
  216. # Supported types: ['fp16', 'bf16','fp32']
  217. # Default value is fp32
  218. # Users can configure in ds_config.json as below example:
  219. SEQ_PARALLEL_COMMUNICATION_DATA_TYPE_FORMAT = '''
  220. Optional comm data type for seq paralleism should be set as:
  221. "seq_parallel_communication_data_type": "fp32"
  222. '''
  223. SEQ_PARALLEL_COMMUNICATION_DATA_TYPE = "seq_parallel_comm_data_type"
  224. SEQ_PARALLEL_COMMUNICATION_DATA_TYPE_DEFAULT = "fp32"
  225. #########################################
  226. # Scale/predivide gradients before allreduce
  227. #########################################
  228. # Prescale gradients. By default, this feature is not enabled.
  229. # Users can configure in ds_config.json as below example:
  230. PRESCALE_GRADIENTS_FORMAT = '''
  231. Gradient prescaling should be enabled as:
  232. "prescale_gradients": true
  233. '''
  234. PRESCALE_GRADIENTS = "prescale_gradients"
  235. PRESCALE_GRADIENTS_DEFAULT = False
  236. GRADIENT_PREDIVIDE_FACTOR_FORMAT = '''
  237. Gradient predivide factor should be enabled as:
  238. "gradient_predivide_factor": 1.0
  239. '''
  240. GRADIENT_PREDIVIDE_FACTOR = "gradient_predivide_factor"
  241. GRADIENT_PREDIVIDE_FACTOR_DEFAULT = 1.0
  242. #########################################
  243. # Disable AllGather
  244. #########################################
  245. # Disable AllGather. By default, this feature is not enabled.
  246. # Users can configure in ds_config.json as below example:
  247. DISABLE_ALLGATHER_FORMAT = '''
  248. Disable AllGather should be enabled as:
  249. "disable_allgather": true
  250. '''
  251. DISABLE_ALLGATHER = "disable_allgather"
  252. DISABLE_ALLGATHER_DEFAULT = False
  253. #########################################
  254. # Dump DeepSpeed state
  255. #########################################
  256. # Dump State. By default, this feature is not enabled.
  257. # Users can configure in ds_config.json as below example:
  258. DUMP_STATE_FORMAT = '''
  259. Dump state should be enabled as:
  260. "dump_state": true
  261. '''
  262. DUMP_STATE = 'dump_state'
  263. DUMP_STATE_DEFAULT = False
  264. #########################################
  265. # Vocabulary size
  266. #########################################
  267. # Vocabulary size.
  268. # Users can configure in ds_config.json as below example:
  269. VOCABULARY_SIZE_FORMAT = '''
  270. Vocabulary size can be specified as:
  271. "vocabulary_size": 1024
  272. '''
  273. VOCABULARY_SIZE = 'vocabulary_size'
  274. VOCABULARY_SIZE_DEFAULT = None
  275. #########################################
  276. # Wall block breakdown
  277. #########################################
  278. # Wall clock breakdown. By default, this feature is not enabled.
  279. # Users can configure in ds_config.json as below example:
  280. WALL_CLOCK_BREAKDOWN_FORMAT = '''
  281. Wall block breakdown should be enabled as:
  282. "wall_clock_breakdown": true
  283. '''
  284. WALL_CLOCK_BREAKDOWN = 'wall_clock_breakdown'
  285. WALL_CLOCK_BREAKDOWN_DEFAULT = False
  286. MEMORY_BREAKDOWN = 'memory_breakdown'
  287. MEMORY_BREAKDOWN_DEFAULT = False
  288. #########################################
  289. # Eigenvalue
  290. #########################################
  291. # Eigenvalue computation. By default, this feature is not enabled.
  292. # Users can configure in ds_config.json as below example:
  293. EIGENVALUE_FORMAT = '''
  294. Tensorboard can be specified as:
  295. "eigenvalue": {
  296. "enabled": true,
  297. "verbose": true,
  298. "max_iter": 100,
  299. "tol": 1e-2,
  300. "stability": 1e-6
  301. }
  302. '''
  303. EIGENVALUE = "eigenvalue"
  304. # Tensorboard enable signal
  305. EIGENVALUE_ENABLED = "enabled"
  306. EIGENVALUE_ENABLED_DEFAULT = False
  307. EIGENVALUE_VERBOSE = "verbose"
  308. EIGENVALUE_VERBOSE_DEFAULT = False
  309. EIGENVALUE_MAX_ITER = "max_iter"
  310. EIGENVALUE_MAX_ITER_DEFAULT = 100
  311. EIGENVALUE_TOL = "tol"
  312. EIGENVALUE_TOL_DEFAULT = 1e-2
  313. EIGENVALUE_STABILITY = "stability"
  314. EIGENVALUE_STABILITY_DEFAULT = 1e-6
  315. EIGENVALUE_GAS_BOUNDARY_RESOLUTION = "gas_boundary_resolution"
  316. EIGENVALUE_GAS_BOUNDARY_RESOLUTION_DEFAULT = 1
  317. EIGENVALUE_LAYER_NAME = "layer_name"
  318. EIGENVALUE_LAYER_NAME_DEFAULT = "bert.encoder.layer"
  319. EIGENVALUE_LAYER_NUM = "layer_num"
  320. EIGENVALUE_LAYER_NUM_DEFAULT = 0
  321. #########################################
  322. # Progressive Layer Drop (PLD)
  323. #########################################
  324. PROGRESSIVE_LAYER_DROP = "progressive_layer_drop"
  325. # PLD enable signal
  326. PLD_ENABLED = "enabled"
  327. PLD_ENABLED_DEFAULT = False
  328. PLD_THETA = "theta"
  329. PLD_THETA_DEFAULT = 1.0
  330. PLD_GAMMA = "gamma"
  331. PLD_GAMMA_DEFAULT = 0.001
  332. #########################################
  333. # Validation modes
  334. #########################################
  335. class ValidationMode:
  336. WARN = "WARN"
  337. IGNORE = "IGNORE"
  338. FAIL = "FAIL"
  339. #########################################
  340. # Checkpoint config params
  341. #########################################
  342. # "checkpoint": {
  343. # tag_validation=["Ignore"|"Warn"|"Fail"]
  344. # load_universal=false
  345. # use_node_local_storage=false
  346. # parallel_write: {
  347. # pipeline_stage: [True|False]
  348. # }
  349. # }
  350. CHECKPOINT = "checkpoint"
  351. CHECKPOINT_TAG_VALIDATION = "tag_validation"
  352. CHECKPOINT_TAG_VALIDATION_DEFAULT = ValidationMode.WARN
  353. CHECKPOINT_TAG_VALIDATION_MODES = [ValidationMode.WARN, ValidationMode.IGNORE, ValidationMode.FAIL]
  354. LOAD_UNIVERSAL_CHECKPOINT = "load_universal"
  355. LOAD_UNIVERSAL_CHECKPOINT_DEFAULT = False
  356. USE_NODE_LOCAL_STORAGE_CHECKPOINT = "use_node_local_storage"
  357. USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULT = False
  358. CHECKPOINT_PARALLEL_WRITE = "parallel_write"
  359. CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE = "pipeline_stage"
  360. CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT = False
  361. #########################################
  362. # Data types config params
  363. #########################################
  364. # "data_types": {
  365. # grad_accum_dtype=["bf16"|"fp16"|"fp32"]
  366. # }
  367. # }
  368. DATA_TYPES = "data_types"
  369. GRAD_ACCUM_DTYPE = "grad_accum_dtype"
  370. GRAD_ACCUM_DTYPE_DEFAULT = None
  371. #########################################
  372. # Drop the last incomplete Batch
  373. # #########################################
  374. # dataloader_drop_last. By default, this feature is not enabled.
  375. # Users can configure in ds_config.json as below example:
  376. DATALOADER_DROP_LAST_FORMAT = '''
  377. The last incomplete batch can be dropped by setting:
  378. "dataloader_drop_last": True
  379. '''
  380. DATALOADER_DROP_LAST = "dataloader_drop_last"
  381. DATALOADER_DROP_LAST_DEFAULT = False
  382. #########################################
  383. # PIPELINE PARALLELISM
  384. #########################################
  385. PIPE_REPLICATED = 'ds_pipe_replicated'
  386. #########################################
  387. # DATA PARALLELISM
  388. #########################################
  389. DATA_PARALLEL_GROUP = "data_parallel_group"
  390. GLOBAL_RANK = "global_rank"
  391. #########################################
  392. # EXPERT-DATA PARALLELISM TOPO Config
  393. #########################################
  394. USE_DATA_BEFORE_EXPERT_PARALLEL = "use_data_before_expert_parallelism"
  395. USE_DATA_BEFORE_EXPERT_PARALLEL_DEFAULT = False