# constants.py (2.4 KB)
  1. """
  2. Copyright 2020 The Microsoft DeepSpeed Team
  3. """
  4. #########################################
  5. # Elasticity
  6. #########################################
  7. ''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible
  8. with a large number of GPUs. For elastic jobs, DeepSpeed will provide a batch size that
  9. can support a large number of GPUs based on the user specified parameters
  10. '''
  11. FORMAT = '''
  12. Elasticity should be enabled as:
  13. "elasticity": {
  14. "enabled": true,
  15. "max_train_batch_size": 2000,
  16. "micro_batch_sizes": [2,4,6],
  17. "min_gpus": 1,
  18. "max_gpus" : 10000,
  19. "min_time": 20,
  20. "prefer_larger_batch": true,
  21. "ignore_non_elastic_batch_info": false,
  22. "version": 0.1
  23. }
  24. '''
  25. ELASTICITY = 'elasticity'
  26. # Current elasticity version
  27. LATEST_ELASTICITY_VERSION = 0.2
  28. ENABLED = 'enabled'
  29. ENABLED_DEFAULT = False
  30. # Max acceptable train_batch_size
  31. MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size'
  32. MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000
  33. # Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu
  34. MICRO_BATCHES = 'micro_batch_sizes'
  35. MICRO_BATCHES_DEFAULT = [2, 4, 6]
  36. # Min/max of GPUs to search over
  37. MIN_GPUS = 'min_gpus'
  38. MIN_GPUS_DEFAULT = 1
  39. MAX_GPUS = 'max_gpus'
  40. MAX_GPUS_DEFAULT = 10000
  41. NUM_GPUS_PER_NODE = 'num_gpus_per_node'
  42. NUM_GPUS_PER_NODE_DEFAULT = 1
  43. MODEL_PARLLEL_SIZE = "model_parallel_size"
  44. MODEL_PARLLEL_SIZE_DEFAULT = 1
  45. # Minimum running time (minutes) before the scheduler will scale us, 0 implies it's unknown
  46. MIN_TIME = "min_time"
  47. MIN_TIME_DEFAULT = 0
  48. # When finding a suitable batch size, attempt to find one that is closest
  49. # to the max train batch size given.
  50. PREFER_LARGER_BATCH = 'prefer_larger_batch'
  51. PREFER_LARGER_BATCH_DEFAULT = True
  52. # In order to reduce confusion, if elastic mode is enabled we
  53. # require (via assert) that no batch info is set outside of the
  54. # elastic config. You can turn off this assert via this config
  55. # but keep in mind that all batch info defined outside the
  56. # elastic mode *will be ignored*.
  57. IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info'
  58. IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False
  59. # Version of elastic logic to use
  60. VERSION = "version"
  61. VERSION_DEFAULT = LATEST_ELASTICITY_VERSION
  62. # Minimum deepspeed version to use elasticity
  63. MINIMUM_DEEPSPEED_VERSION = "0.3.8"
  64. # Environment variable storing elastic config from resource scheduler
  65. DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG"