constants.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. """
  2. Copyright (c) Microsoft Corporation
  3. Licensed under the MIT license.
  4. """
  5. #############################################
  6. # Routes
  7. #############################################
  8. ROUTE_TRAIN = "train"
  9. ROUTE_EVAL = "eval"
  10. ROUTE_PREDICT = "predict"
  11. ROUTE_ENCODE = "encode"
  12. #############################################
  13. # Batch size
  14. #############################################
  15. TRAIN_BATCH_SIZE = "train_batch_size"
  16. TRAIN_BATCH_SIZE_DEFAULT = None
  17. #############################################
  18. # Sparse attention
  19. #############################################
  20. SPARSE_ATTENTION = "sparse_attention"
  21. SPARSE_DENSE_MODE = "dense"
  22. SPARSE_FIXED_MODE = "fixed"
  23. SPARSE_VARIABLE_MODE = "variable"
  24. SPARSE_BIGBIRD_MODE = "bigbird"
  25. SPARSE_BSLONGFORMER_MODE = "bslongformer"
  26. SPARSE_MODE = "mode"
  27. SPARSE_MODE_DEFAULT = SPARSE_FIXED_MODE
  28. SPARSE_BLOCK = "block"
  29. SPARSE_BLOCK_DEFAULT = 16
  30. SPARSE_DIFFERENT_LAYOUT_PER_HEAD = "different_layout_per_head"
  31. SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT = False
  32. SPARSE_NUM_LOCAL_BLOCKS = "num_local_blocks"
  33. SPARSE_NUM_LOCAL_BLOCKS_DEFAULT = 4
  34. SPARSE_NUM_GLOBAL_BLOCKS = "num_global_blocks"
  35. SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT = 1
  36. SPARSE_ATTENTION_TYPE = "attention"
  37. SPARSE_ATTENTION_TYPE_DEFAULT = "bidirectional"
  38. SPARSE_HORIZONTAL_GLOBAL_ATTENTION = "horizontal_global_attention"
  39. SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT = False
  40. SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS = "num_different_global_patterns"
  41. SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT = 1
  42. SPARSE_NUM_RANDOM_BLOCKS = "num_random_blocks"
  43. SPARSE_NUM_RANDOM_BLOCKS_DEFAULT = 0
  44. SPARSE_LOCAL_WINDOW_BLOCKS = "local_window_blocks"
  45. SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT = [4]
  46. SPARSE_GLOBAL_BLOCK_INDICES = "global_block_indices"
  47. SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT = [0]
  48. SPARSE_GLOBAL_BLOCK_END_INDICES = "global_block_end_indices"
  49. SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT = None
  50. SPARSE_NUM_SLIDING_WINDOW_BLOCKS = "num_sliding_window_blocks"
  51. SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT = 3
  52. #############################################
  53. # Optimizer and lr scheduler
  54. #############################################
  55. OPTIMIZER = "optimizer"
  56. OPTIMIZER_TYPE_DEFAULT = None
  57. OPTIMIZER_PARAMS = "params"
  58. TYPE = "type"
  59. LEGACY_FUSION = "legacy_fusion"
  60. LEGACY_FUSION_DEFAULT = False
  61. SCHEDULER = "scheduler"
  62. SCHEDULER_TYPE_DEFAULT = None
  63. SCHEDULER_PARAMS = "params"
  64. MAX_GRAD_NORM = 'max_grad_norm'
  65. #############################################
  66. # Optimizer and lr scheduler
  67. #############################################
  68. ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer"
  69. ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False
  70. # Steps
  71. STEPS_PER_PRINT = "steps_per_print"
  72. STEPS_PER_PRINT_DEFAULT = 10
  73. #########################################
  74. # Training micro batch size per GPU
  75. #########################################
  76. # Batch size for one training step. This is used when the
  77. # TRAIN_BATCH_SIZE cannot fit in GPU memory to determine
  78. # the number of gradient accumulation steps. By default, this
  79. # is set to None. Users can configure in ds_config.json as below example:
  80. TRAIN_MICRO_BATCH_SIZE_PER_GPU = '''
  81. TRAIN_MICRO_BATCH_SIZE_PER_GPU is defined in this format:
  82. "train_micro_batch_size_per_gpu": 1
  83. '''
  84. TRAIN_MICRO_BATCH_SIZE_PER_GPU = "train_micro_batch_size_per_gpu"
  85. TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = None
  86. #########################################
  87. # Gradient Accumulation
  88. #########################################
  89. # Gradient accumulation feature. By default, this feature is not enabled.
  90. # Users can configure in ds_config.json as below example:
  91. GRADIENT_ACCUMULATION_FORMAT = '''
  92. Gradient Accumulation should be of the format:
  93. "gradient_accumulation_steps": 1
  94. '''
  95. GRADIENT_ACCUMULATION_STEPS = "gradient_accumulation_steps"
  96. GRADIENT_ACCUMULATION_STEPS_DEFAULT = None
  97. # DeepSpeed CSR gradient sparsity
  98. SPARSE_GRADIENTS = "sparse_gradients"
  99. SPARSE_GRADIENTS_DEFAULT = False
  100. #########################################
  101. # BFLOAT16 support
  102. #########################################
  103. # BFLOAT16 feature. By default, this feature is not enabled.
  104. # Users can configure in ds_config.json as below example:
  105. BFLOAT16_FORMAT = '''
  106. BFLOAT16 parameters should be of the format:
  107. "bfloat16": {
  108. "enabled": true
  109. }
  110. '''
  111. BFLOAT16 = "bfloat16"
  112. BFLOAT16_ENABLED = "enabled"
  113. BFLOAT16_ENABLED_DEFAULT = False
  114. #########################################
  115. # FP16 support
  116. #########################################
  117. # FP16 feature. By default, this feature is not enabled.
  118. # Users can configure in ds_config.json as below example:
  119. FP16_FORMAT = '''
  120. FP16 parameters should be of the format:
  121. "fp16": {
  122. "enabled": true,
  123. "loss_scale": 0,
  124. "initial_scale_power": 32,
  125. "loss_scale_window": 1000,
  126. "hysteresis": 2,
  127. "min_loss_scale": 1
  128. }
  129. '''
  130. FP16 = "fp16"
  131. FP16_ENABLED = "enabled"
  132. FP16_ENABLED_DEFAULT = False
  133. # FP16 loss scale, zero means using dynamic scaling
  134. FP16_LOSS_SCALE = "loss_scale"
  135. FP16_LOSS_SCALE_DEFAULT = 0
  136. # FP16 initial dynamic scale loss power
  137. FP16_INITIAL_SCALE_POWER = "initial_scale_power"
  138. FP16_INITIAL_SCALE_POWER_DEFAULT = 32
  139. # FP16 loss scale window
  140. FP16_LOSS_SCALE_WINDOW = "loss_scale_window"
  141. FP16_LOSS_SCALE_WINDOW_DEFAULT = 1000
  142. # FP16 hysteresis
  143. FP16_HYSTERESIS = "hysteresis"
  144. FP16_HYSTERESIS_DEFAULT = 2
  145. # FP16 min loss scale
  146. FP16_MIN_LOSS_SCALE = "min_loss_scale"
  147. FP16_MIN_LOSS_SCALE_DEFAULT = 1
  148. # FP16 master and grads
  149. FP16_MASTER_WEIGHTS_AND_GRADS = "fp16_master_weights_and_grads"
  150. FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT = False
  151. #########################################
  152. # Apex AMP support
  153. #########################################
  154. # Use Apex AMP for mixed precision support, all parameters (other than 'enabled') will be passed to
  155. # amp.initialize(model, optimizer, **amp_params)
  156. # See apex documentation for supported parameters/features: https://nvidia.github.io/apex/amp.html#apex.amp.initialize
  157. AMP_FORMAT = '''
  158. "amp" {
  159. "enabled: true,
  160. "opt_level": "O1",
  161. ...
  162. }
  163. '''
  164. AMP = "amp"
  165. AMP_ENABLED = "enabled"
  166. AMP_ENABLED_DEFAULT = False
  167. #########################################
  168. # Gradient clipping
  169. #########################################
  170. # Gradient clipping. By default, this feature is not enabled.
  171. # Users can configure in ds_config.json as below example:
  172. GRADIENT_CLIPPING_FORMAT = '''
  173. Gradient clipping should be enabled as:
  174. "gradient_clipping": 1.0
  175. '''
  176. GRADIENT_CLIPPING = 'gradient_clipping'
  177. GRADIENT_CLIPPING_DEFAULT = 0.
  178. #########################################
  179. # Communication data type
  180. #########################################
  181. # Supported types: ['none', 'fp16', 'fp32']
  182. # By default, this feature is not enabled ('none' value)
  183. # Users can configure in ds_config.json as below example:
  184. COMMUNICATION_DATA_TYPE_FORMAT = '''
  185. Communication data type should be set as:
  186. "communication_data_type": "fp32"
  187. '''
  188. COMMUNICATION_DATA_TYPE = "communication_data_type"
  189. COMMUNICATION_DATA_TYPE_DEFAULT = None
  190. #########################################
  191. # Scale/predivide gradients before allreduce
  192. #########################################
  193. # Prescale gradients. By default, this feature is not enabled.
  194. # Users can configure in ds_config.json as below example:
  195. PRESCALE_GRADIENTS_FORMAT = '''
  196. Gradient prescaling should be enabled as:
  197. "prescale_gradients": true
  198. '''
  199. PRESCALE_GRADIENTS = "prescale_gradients"
  200. PRESCALE_GRADIENTS_DEFAULT = False
  201. GRADIENT_PREDIVIDE_FACTOR_FORMAT = '''
  202. Gradient predivide factor should be enabled as:
  203. "gradient_predivide_factor": 1.0
  204. '''
  205. GRADIENT_PREDIVIDE_FACTOR = "gradient_predivide_factor"
  206. GRADIENT_PREDIVIDE_FACTOR_DEFAULT = 1.0
  207. #########################################
  208. # Disable AllGather
  209. #########################################
  210. # Disable AllGather. By default, this feature is not enabled.
  211. # Users can configure in ds_config.json as below example:
  212. DISABLE_ALLGATHER_FORMAT = '''
  213. Disable AllGather should be enabled as:
  214. "disable_allgather": true
  215. '''
  216. DISABLE_ALLGATHER = "disable_allgather"
  217. DISABLE_ALLGATHER_DEFAULT = False
  218. #########################################
  219. # Dump DeepSpeed state
  220. #########################################
  221. # Dump State. By default, this feature is not enabled.
  222. # Users can configure in ds_config.json as below example:
  223. DUMP_STATE_FORMAT = '''
  224. Dump state should be enabled as:
  225. "dump_state": true
  226. '''
  227. DUMP_STATE = 'dump_state'
  228. DUMP_STATE_DEFAULT = False
  229. #########################################
  230. # Vocabulary size
  231. #########################################
  232. # Vocabulary size.
  233. # Users can configure in ds_config.json as below example:
  234. VOCABULARY_SIZE_FORMAT = '''
  235. Vocabulary size can be specified as:
  236. "vocabulary_size": 1024
  237. '''
  238. VOCABULARY_SIZE = 'vocabulary_size'
  239. VOCABULARY_SIZE_DEFAULT = None
  240. #########################################
  241. # Wall block breakdown
  242. #########################################
  243. # Wall clock breakdown. By default, this feature is not enabled.
  244. # Users can configure in ds_config.json as below example:
  245. WALL_CLOCK_BREAKDOWN_FORMAT = '''
  246. Wall block breakdown should be enabled as:
  247. "wall_clock_breakdown": true
  248. '''
  249. WALL_CLOCK_BREAKDOWN = 'wall_clock_breakdown'
  250. WALL_CLOCK_BREAKDOWN_DEFAULT = False
  251. MEMORY_BREAKDOWN = 'memory_breakdown'
  252. MEMORY_BREAKDOWN_DEFAULT = False
  253. #########################################
  254. # Tensorboard
  255. #########################################
  256. # Tensorboard. By default, this feature is not enabled.
  257. # Users can configure in ds_config.json as below example:
  258. TENSORBOARD_FORMAT = '''
  259. Tensorboard can be specified as:
  260. "tensorboard": {
  261. "enabled": true,
  262. "output_path": "/home/myname/foo",
  263. "job_name": "model_lr2e-5_epoch3_seed2_seq64"
  264. }
  265. '''
  266. TENSORBOARD = "tensorboard"
  267. # Tensorboard enable signal
  268. TENSORBOARD_ENABLED = "enabled"
  269. TENSORBOARD_ENABLED_DEFAULT = False
  270. # Tensorboard output path
  271. TENSORBOARD_OUTPUT_PATH = "output_path"
  272. TENSORBOARD_OUTPUT_PATH_DEFAULT = ""
  273. # Tensorboard job name
  274. TENSORBOARD_JOB_NAME = "job_name"
  275. TENSORBOARD_JOB_NAME_DEFAULT = "DeepSpeedJobName"
  276. #########################################
  277. # Eigenvalue
  278. #########################################
  279. # Eigenvalue computation. By default, this feature is not enabled.
  280. # Users can configure in ds_config.json as below example:
  281. EIGENVALUE_FORMAT = '''
  282. Tensorboard can be specified as:
  283. "eigenvalue": {
  284. "enabled": true,
  285. "verbose": true,
  286. "max_iter": 100,
  287. "tol": 1e-2,
  288. "stability": 1e-6
  289. }
  290. '''
  291. EIGENVALUE = "eigenvalue"
  292. # Tensorboard enable signal
  293. EIGENVALUE_ENABLED = "enabled"
  294. EIGENVALUE_ENABLED_DEFAULT = False
  295. EIGENVALUE_VERBOSE = "verbose"
  296. EIGENVALUE_VERBOSE_DEFAULT = False
  297. EIGENVALUE_MAX_ITER = "max_iter"
  298. EIGENVALUE_MAX_ITER_DEFAULT = 100
  299. EIGENVALUE_TOL = "tol"
  300. EIGENVALUE_TOL_DEFAULT = 1e-2
  301. EIGENVALUE_STABILITY = "stability"
  302. EIGENVALUE_STABILITY_DEFAULT = 1e-6
  303. EIGENVALUE_GAS_BOUNDARY_RESOLUTION = "gas_boundary_resolution"
  304. EIGENVALUE_GAS_BOUNDARY_RESOLUTION_DEFAULT = 1
  305. EIGENVALUE_LAYER_NAME = "layer_name"
  306. EIGENVALUE_LAYER_NAME_DEFAULT = "bert.encoder.layer"
  307. EIGENVALUE_LAYER_NUM = "layer_num"
  308. EIGENVALUE_LAYER_NUM_DEFAULT = 0
  309. #########################################
  310. # Progressive Layer Drop (PLD)
  311. #########################################
  312. PROGRESSIVE_LAYER_DROP = "progressive_layer_drop"
  313. # PLD enable signal
  314. PLD_ENABLED = "enabled"
  315. PLD_ENABLED_DEFAULT = False
  316. PLD_THETA = "theta"
  317. PLD_THETA_DEFAULT = 1.0
  318. PLD_GAMMA = "gamma"
  319. PLD_GAMMA_DEFAULT = 0.001
  320. #########################################
  321. # Curriculum Learning
  322. #########################################
  323. CURRICULUM_LEARNING = "curriculum_learning"
  324. CURRICULUM_ENABLED = "enabled"
  325. CURRICULUM_ENABLED_DEFAULT = False
  326. #########################################
  327. # Validation modes
  328. #########################################
  329. class ValidationMode:
  330. WARN = "WARN"
  331. IGNORE = "IGNORE"
  332. FAIL = "FAIL"
  333. #########################################
  334. # Checkpoint config params
  335. #########################################
  336. # "checkpoint": {tag_validation=["Ignore"|"Warn"|"Fail"]}
  337. CHECKPOINT = "checkpoint"
  338. CHECKPOINT_TAG_VALIDATION = "tag_validation"
  339. CHECKPOINT_TAG_VALIDATION_DEFAULT = ValidationMode.WARN
  340. CHECKPOINT_TAG_VALIDATION_MODES = [
  341. ValidationMode.WARN,
  342. ValidationMode.IGNORE,
  343. ValidationMode.FAIL
  344. ]
  345. #########################################
  346. # Quantization
  347. #########################################
  348. QUANTIZE_TRAINING = "quantize_training"
  349. QUANTIZE_BITS = "quantize_bits"
  350. START_BITS = "start_bits"
  351. TARGET_BITS = "target_bits"
  352. QUANTIZER_KERNEL = "quantizer_kernel"
  353. QUANTIZE_SCHEDULE = "quantize_schedule"
  354. QUANTIZE_PERIOD = "quantize_period"
  355. SCHEDULE_OFFSET = "schedule_offset"
  356. QUANTIZE_GROUPS = "quantize_groups"
  357. FP16_MIXED_QUANTIZE = "fp16_mixed_quantize"
  358. QUANTIZE_CHANGE_RATIO = "quantize_change_ratio"
  359. FP16_MIXED_QUANTIZE_ENABLED = "enabled"
  360. QUANTIZE_VERBOSE = "quantize_verbose"
  361. QUANTIZE_ALGO = "quantize_algo"
  362. QUANTIZE_TYPE = "q_type"
  363. QUANTIZE_SYMMETRIC = "symmetric"
  364. QUANTIZE_ASYMMETRIC = "asymmetric"
  365. STOCHASTIC_ROUNDING = "stochastic"
  366. NEAREST_ROUNDING = "nearest"
  367. QUANTIZE_ROUNDING = "rounding"
  368. QUANTIZE_TRAINING_ENABLED = "enabled"
  369. QUANTIZE_TRAINING_ENABLED_DEFAULT = False
  370. QUANTIZE_TRAINING_DEFAULT = False
  371. QUANTIZE_START_BITS_DEFAULT = 16
  372. QUANTIZE_TARGET_BITS_DEFAULT = 8
  373. QUANTIZER_KERNEL_DEFAULT = False
  374. QUANTIZE_PERIOD_DEFAULT = 1000
  375. QUANTIZE_OFFSET_DEFAULT = 1000
  376. QUANTIZE_GROUPS_DEFAULT = 1
  377. QUANTIZE_TYPE_DEFAULT = 0 #symmetric
  378. QUANTIZE_ROUNDING_DEFAULT = 0 #nearest
  379. FP16_MIXED_QUANTIZE_ENABLED_DEFAULT = False
  380. QUANTIZE_CHANGE_RATIO_DEFAULT = 0.001
  381. QUANTIZE_VERBOSE_DEFAULT = False
  382. #########################################
  383. # Drop the last incomplete Batch
  384. # #########################################
  385. # dataloader_drop_last. By default, this feature is not enabled.
  386. # Users can configure in ds_config.json as below example:
  387. DATALOADER_DROP_LAST_FORMAT = '''
  388. The last incomplete batch can be dropped by setting:
  389. "dataloader_drop_last": True
  390. '''
  391. DATALOADER_DROP_LAST = "dataloader_drop_last"
  392. DATALOADER_DROP_LAST_DEFAULT = False
  393. #########################################
  394. # Optimizer checkpoint keys
  395. #########################################
  396. OPTIMIZER_STATE_DICT = "optimizer_state_dict"
  397. FP32_GROUPS = "fp32_groups"