# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
"""
Various symbolic constants used for model checkpointing
"""

#########################################
# Optimizer checkpoint keys
#########################################
OPTIMIZER_STATE_DICT = 'optimizer_state_dict'
FP32_GROUPS = 'fp32_groups'
FP32_FLAT_GROUPS = 'fp32_flat_groups'

BASE_OPTIMIZER_STATE = 'base_optimizer_state'
BASE_OPTIMIZER_STATE_STEP = 'base_optimizer_state_step'
SINGLE_PARTITION_OF_FP32_GROUPS = 'single_partition_of_fp32_groups'
PARAM_GROUPS = 'param_groups'
GROUP_PADDINGS = 'group_paddings'
PARTITION_COUNT = 'partition_count'
ZERO_STAGE = 'zero_stage'
CLIP_GRAD = 'clip_grad'
FP32_WEIGHT_KEY = 'fp32'
LOSS_SCALER = 'loss_scaler'

#########################################
# Module checkpoint keys
#########################################
PARAM = 'param'
PARAM_SHAPES = 'param_shapes'
BUFFER_NAMES = 'buffer_names'
FROZEN_PARAM_SHAPES = 'frozen_param_shapes'
FROZEN_PARAM_FRAGMENTS = 'frozen_param_fragments'

#########################################
# Checkpoint naming constants
#########################################
MODEL_FILE_PREFIX = 'mp_rank_'
ZERO_FILE_PREFIX = 'zero_pp_rank_'
OPTIM_FILE_SUFFIX = '_optim_states.pt'
MODEL_FILE_SUFFIX = '_model_states.pt'
LAYER_FILE_PREFIX = 'layer_'
# Dtype-qualified variants of the ZeRO partition file prefix.
BF16_ZERO_FILE_PREFIX = 'bf16_' + ZERO_FILE_PREFIX
FP16_ZERO_FILE_PREFIX = 'fp16_' + ZERO_FILE_PREFIX

#########################################
# Checkpoint utility keys
#########################################
DS_VERSION = 'ds_version'

#########################################
# Universal Checkpoint keys
#########################################
UNIVERSAL_CHECKPOINT_INFO = 'universal_checkpoint_info'

UNIVERSAL_CHECKPOINT_VERSION_KEY = 'universal_checkpoint_version'
# Reserve version 0.1 for the hardcoded logic used in BLOOM-176B training
UNIVERSAL_CHECKPOINT_VERSION_VALUE = 0.2

# Vocabulary padding
VOCAB_TENSOR = 'vocab_tensor'
PADDED_VOCAB_SIZE = 'padded_vocab_size'
ORIGINAL_VOCAB_SIZE = 'original_vocab_size'

# Parameter splitting/merging
PARAM_SLICE_MAPPINGS = 'param_slice_mappings'
CAT_DIM = 'cat_dim'

# Following is a special case where a parameter effectively contains sub parameters.
# As an example, consider Megatron-DeepSpeed GPT SWIGLU implementation (mlp.h_to_4h).
# In this case, a single parameter is allocated contiguously, but used as separate parameters.
# When using universal checkpoint, we have to normalize the representation of the full parameter.
# We normalize it by concatenating all slices of the sub params and then concatenating the sub params.
# All concat operations are done on CAT_DIM (currently, no support for different concat dims sub params and TP slicing).
# Similarly, load_hp_checkpoint_state has to take the needed actions when loading from universal.
PARAM_N_SUB_PARAMS = 'param_n_sub_params'

SUB_PARAM_SHAPE = 'sub_param_shape'

# Regex list of parameters that require special handling
VOCABULARY_PARAMETER_PATTERNS = 'vocabulary_parameter_patterns'
PIPELINE_REPLICATED_PARAMETER_PATTERNS = 'pipeline_replicated_parameter_patterns'
PARAMETER_TO_AVERAGE_PATTERNS = 'parameter_to_average_patterns'
PARAMETER_WITH_ROW_PARALLELISM_PATTERNS = 'parameter_with_row_parallelism_patterns'
TP_REPLICATED_PARAMETER_PATTERNS = 'tp_replicated_parameter_patterns'
PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0 = 'parameter_with_2_sub_params_cat_dim_0'
PARAMETER_WITH_SUB_PARAMS = 'parameter_with_sub_params'
SUB_PARAMS_SHAPE = 'sub_params_shape'