constants.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. # Copyright (c) Microsoft Corporation.
  2. # SPDX-License-Identifier: Apache-2.0
  3. # DeepSpeed Team
  4. #########################################
  5. # nebula
  6. #########################################
  7. # Nebula. By default, this feature is not enabled.
  8. # Users can configure it in ds_config.json as in the example below:
  9. NEBULA_FORMAT = '''
  10. nebula should be enabled as:
  11. "session_params": {
  12. "nebula": {
  13. "enabled": true,
  14. "persistent_storage_path": "/foo/bar",
  15. "persistent_time_interval": 100,
  16. "num_of_version_in_retention": 2,
  17. "enable_nebula_load": true
  18. }
  19. }
  20. '''
  21. NEBULA = "nebula"
  22. NEBULA_ENABLED = "enabled"
  23. NEBULA_ENABLED_DEFAULT = False
  24. # There is a case where a customer wants to load a checkpoint saved
  25. # by raw torch. Nebula cannot load a torch checkpoint directly
  26. # because the two use different folder structures, which leaves a gap
  27. # for loading (the data are byte-for-byte identical between torch and
  28. # nebula saving).
  29. # In this case, we must disable nebula load to use raw torch load.
  30. # The customer can just set NEBULA_ENABLE_NEBULA_LOAD to False and then use
  31. # the original DeepSpeed way of loading, i.e. set the value of "--load".
  32. NEBULA_ENABLE_NEBULA_LOAD = "enable_nebula_load"
  33. NEBULA_ENABLE_NEBULA_LOAD_DEFAULT = True
  34. # When you want to resume from a previous checkpoint saved by nebula,
  35. # you can set NEBULA_LOAD_PATH to the parent folder of the checkpoint.
  36. # If NEBULA_LOAD_PATH is None, the NEBULA_PERSISTENT_STORAGE_PATH
  37. # will be the default path to load from.
  38. NEBULA_LOAD_PATH = "nebula_load_path"
  39. NEBULA_LOAD_PATH_DEFAULT = None
  40. # Nebula will save the checkpoint under NEBULA_PERSISTENT_STORAGE_PATH
  41. # asynchronously.
  42. NEBULA_PERSISTENT_STORAGE_PATH = "persistent_storage_path"
  43. NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT = None
  44. # Time interval to trigger the nebula persistence.
  45. NEBULA_PERSISTENT_TIME_INTERVAL = "persistent_time_interval"
  46. NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100
  47. # Number of checkpoints which will be kept in memory. For example,
  48. # if the value is 2 and checkpoints 1 and 2 are ready, then
  49. # when checkpoint 3 arrives, checkpoint 1 will be removed if
  50. # it has been persisted to disk.
  51. NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
  52. NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2
  53. # Nebula envs
  54. NEBULA_EXPORT_ENVS = [
  55. 'DLTS_JOB_ID', 'DLTS_NUM_WORKER', 'NEBULA_PERSISTENT_STORAGE_PATH', 'NEBULA_PERSISTENT_TIME_INTERVAL',
  56. 'AML_RUN_ID', 'AZUREML_RUN_TOKEN', 'AZUREML_WORKSPACE_SCOPE', 'AZUREML_EXPERIMENT_SCOPE',
  57. 'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT', 'AZUREML_RUN_ID', 'NEBULA_MEMORY_BUFFER_SIZE',
  58. 'AZUREML_PARAMETER_ITPJOB_NAME', 'FC_TASKROLE_NAME', 'FC_TASK_INDEX', 'MASTER_HOST', 'LOCAL_HOST',
  59. 'AZUREML_BLOB_ACCOUNT_NAME', 'AZUREML_BLOB_ACCOUNT_KEY'
  60. ]
  61. # ITP env files
  62. DLTS_POD_ENV_PATH = '/dlts-runtime/env/pod.env'