# constants.py
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
  6. #########################################
  7. # nebula
  8. #########################################
  9. # Nebula. By default, this feature is not enabled.
  10. # Users can configure in ds_config.json as below example:
  11. NEBULA_FORMAT = '''
  12. nebula should be enabled as:
  13. "session_params": {
  14. "nebula": {
  15. "enabled": true,
  16. "persistent_storage_path": "/foo/bar",
  17. "persistent_time_interval": 100,
  18. "num_of_version_in_retention": 2,
  19. "enable_nebula_load": true
  20. }
  21. }
  22. '''
  23. NEBULA = "nebula"
  24. NEBULA_ENABLED = "enabled"
  25. NEBULA_ENABLED_DEFAULT = False
  26. # There is a case where customer want to load the checkpoint saved
  27. # by raw torch. Because nebula cannot load torch checkpoint directly
  28. # as they have different folder structures to bring the gap for
  29. # loading(the data are totaly same in bytes for torch and enbula s
  30. # aving).
  31. # In this case, we must disable nebula load to use raw torch load.
  32. # Customer can just set NEBULA_ENABLE_NEBULA_LOAD to False. Then use
  33. # original way of deepspeed to load, i.e. set the value of "--load".
  34. NEBULA_ENABLE_NEBULA_LOAD = "enable_nebula_load"
  35. NEBULA_ENABLE_NEBULA_LOAD_DEFAULT = True
  36. # When you want to resume the previous checkpoint saved by nebula,
  37. # you can set NEBULA_LOAD_PATH as the parent folder of checkpoint.
  38. # If NEBULA_LOAD_PATH is None, the NEBULA_PERSISTENT_STORAGE_PATH
  39. # will be the default path to load.
  40. NEBULA_LOAD_PATH = "nebula_load_path"
  41. NEBULA_LOAD_PATH_DEFAULT = None
  42. # Nebula will save the checkpoint under NEBULA_LOAD_PATH in the
  43. # asynchronous way.
  44. NEBULA_PERSISTENT_STORAGE_PATH = "persistent_storage_path"
  45. NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT = None
  46. # Time interval to trigger the nebula persistence.
  47. NEBULA_PERSISTENT_TIME_INTERVAL = "persistent_time_interval"
  48. NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100
  49. # Checkpoint number which will be kept in memory. Let us say,
  50. # if the value is 2. Then we have checkpoints 1 and 2 are ready
  51. # now. When it comes to checkpoint 3, the 1 will be removed if
  52. # 1 has been persisted to disk.
  53. NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
  54. NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2
  55. # Neubla envs
  56. NEBULA_EXPORT_ENVS = [
  57. 'DLTS_JOB_ID',
  58. 'DLTS_NUM_WORKER',
  59. 'NEBULA_PERSISTENT_STORAGE_PATH',
  60. 'NEBULA_PERSISTENT_TIME_INTERVAL',
  61. 'AML_RUN_ID',
  62. 'AZUREML_RUN_TOKEN',
  63. 'AZUREML_WORKSPACE_SCOPE',
  64. 'AZUREML_EXPERIMENT_SCOPE',
  65. 'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT',
  66. 'AZUREML_RUN_ID',
  67. 'NEBULA_MEMORY_BUFFER_SIZE',
  68. 'AZUREML_PARAMETER_ITPJOB_NAME',
  69. 'FC_TASKROLE_NAME',
  70. 'FC_TASK_INDEX',
  71. 'MASTER_HOST',
  72. 'LOCAL_HOST',
  73. 'AZUREML_BLOB_ACCOUNT_NAME',
  74. 'AZUREML_BLOB_ACCOUNT_KEY'
  75. ]
  76. # ITP env files
  77. DLTS_POD_ENV_PATH = '/dlts-runtime/env/pod.env'