# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

#########################################
# nebula
#########################################
# Nebula. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below.
# NOTE: the example is illustrative text (JSON fragment), not parsed here.
NEBULA_FORMAT = '''
nebula should be enabled as:
"session_params": {
  "nebula": {
    "enabled": true,
    "persistent_storage_path": "/foo/bar",
    "persistent_time_interval": 100,
    "num_of_version_in_retention": 2,
    "enable_nebula_load": true
  }
}
'''

# Name of the config section that holds all nebula settings.
NEBULA = "nebula"

# Key (and default) of the switch that turns nebula on.
NEBULA_ENABLED = "enabled"
NEBULA_ENABLED_DEFAULT = False

# There is a case where customers want to load a checkpoint saved
# by raw torch. Nebula cannot load a torch checkpoint directly,
# because the two use different folder structures (the data are
# byte-for-byte identical for torch and nebula saving).
# In this case, nebula load must be disabled so raw torch load is used.
# Customers can just set NEBULA_ENABLE_NEBULA_LOAD to False and then use
# the original deepspeed way to load, i.e. set the value of "--load".
NEBULA_ENABLE_NEBULA_LOAD = "enable_nebula_load"
NEBULA_ENABLE_NEBULA_LOAD_DEFAULT = True

# When you want to resume from a previous checkpoint saved by nebula,
# you can set NEBULA_LOAD_PATH to the parent folder of the checkpoint.
# If NEBULA_LOAD_PATH is None, the NEBULA_PERSISTENT_STORAGE_PATH
# will be the default path to load from.
NEBULA_LOAD_PATH = "nebula_load_path"
NEBULA_LOAD_PATH_DEFAULT = None

# Nebula will save the checkpoint under NEBULA_PERSISTENT_STORAGE_PATH
# in the asynchronous way.
# NOTE(review): this comment previously named NEBULA_LOAD_PATH, which
# looks like a copy/paste slip given the note on NEBULA_LOAD_PATH
# above -- confirm against the nebula service behavior.
NEBULA_PERSISTENT_STORAGE_PATH = "persistent_storage_path"
NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT = None

# Time interval to trigger the nebula persistence.
NEBULA_PERSISTENT_TIME_INTERVAL = "persistent_time_interval"
NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100

# Number of checkpoints which will be kept in memory. Let us say
# the value is 2 and checkpoints 1 and 2 are ready now. When it
# comes to checkpoint 3, checkpoint 1 will be removed if it has
# been persisted to disk.
NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2

# Nebula envs
# Kept as a list (not a tuple) so existing callers that concatenate or
# extend lists with it keep working.
NEBULA_EXPORT_ENVS = [
    'DLTS_JOB_ID',
    'DLTS_NUM_WORKER',
    'NEBULA_PERSISTENT_STORAGE_PATH',
    'NEBULA_PERSISTENT_TIME_INTERVAL',
    'AML_RUN_ID',
    'AZUREML_RUN_TOKEN',
    'AZUREML_WORKSPACE_SCOPE',
    'AZUREML_EXPERIMENT_SCOPE',
    'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT',
    'AZUREML_RUN_ID',
    'NEBULA_MEMORY_BUFFER_SIZE',
    'AZUREML_PARAMETER_ITPJOB_NAME',
    'FC_TASKROLE_NAME',
    'FC_TASK_INDEX',
    'MASTER_HOST',
    'LOCAL_HOST',
    'AZUREML_BLOB_ACCOUNT_NAME',
    'AZUREML_BLOB_ACCOUNT_KEY',
]

# ITP env files
DLTS_POD_ENV_PATH = '/dlts-runtime/env/pod.env'