'''Copyright The Microsoft DeepSpeed Team'''
# NOTE: the string literal above is the first statement and therefore the
# module docstring; the literal below is a bare expression describing the file.
'''
Various symbolic constants used for model checkpointing
'''

#########################################
# Optimizer checkpoint keys
#########################################
OPTIMIZER_STATE_DICT = "optimizer_state_dict"
FP32_GROUPS = "fp32_groups"
FP32_FLAT_GROUPS = 'fp32_flat_groups'

BASE_OPTIMIZER_STATE = 'base_optimizer_state'
SINGLE_PARTITION_OF_FP32_GROUPS = "single_partition_of_fp32_groups"
GROUP_PADDINGS = 'group_paddings'
PARTITION_COUNT = 'partition_count'
ZERO_STAGE = 'zero_stage'
CLIP_GRAD = 'clip_grad'
FP32_WEIGHT_KEY = "fp32"

#########################################
# Module checkpoint keys
#########################################
PARAM = 'param'
PARAM_SHAPES = 'param_shapes'
BUFFER_NAMES = 'buffer_names'

#########################################
# Checkpoint naming constants
#########################################
MODEL_FILE_PREFIX = 'mp_rank_'
ZERO_FILE_PREFIX = 'zero_pp_rank_'
OPTIM_FILE_SUFFIX = '_optim_states.pt'
MODEL_FILE_SUFFIX = '_model_states.pt'
LAYER_FILE_PREFIX = 'layer_'
# Derived from ZERO_FILE_PREFIX by prepending a dtype tag:
# 'bf16_zero_pp_rank_' and 'fp16_zero_pp_rank_' respectively.
BF16_ZERO_FILE_PREFIX = 'bf16_' + ZERO_FILE_PREFIX
FP16_ZERO_FILE_PREFIX = 'fp16_' + ZERO_FILE_PREFIX

#########################################
# Checkpoint utility keys
#########################################
DS_VERSION = 'ds_version'

#########################################
# Universal Checkpoint keys
#########################################
UNIVERSAL_CHECKPOINT_INFO = 'universal_checkpoint_info'
UNIVERSAL_CHECKPOINT_VERSION_KEY = 'universal_checkpoint_version'
# Reserve version 0.1 for the hardcoded logic used in BLOOM-176B training
UNIVERSAL_CHECKPOINT_VERSION_VALUE = 0.2

# Vocabulary padding
VOCAB_DIVISIBILITY_PADDING_TENSOR = 'vocab_divisibility_padding_tensor'
PADDED_VOCAB_SIZE = 'padded_vocab_size'
ORIGINAL_VOCAB_SIZE = 'original_vocab_size'

# Parameter splitting/merging
PARAM_SLICE_MAPPINGS = 'param_slice_mappings'
CAT_DIM = "cat_dim"

# Regex list of parameters that require special handling
VOCABULARY_PARAMETER_PATTERNS = 'vocabulary_parameter_patterns'
PIPELINE_REPLICATED_PARAMETER_PATTERNS = 'pipeline_replicated_parameter_patterns'
PARAMETER_TO_AVERAGE_PATTERNS = 'parameter_to_average_patterns'
PARAMETER_WITH_ROW_PARALLELISM_PATTERNS = 'parameter_with_row_parallelism_patterns'