config.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. """
  2. Copyright (c) Microsoft Corporation
  3. Licensed under the MIT license.
  4. """
  5. from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
  6. #########################################
  7. # DeepSpeed Activation Checkpointing
  8. #########################################
  9. # Activation Checkpointing Allows to save memory by only keeping a select few
  10. #activations for the backpropagation.
  11. ACTIVATION_CHKPT_FORMAT = '''
  12. Activation Checkpointing should be configured as:
  13. "session_params": {
  14. "activation_checkpointing": {
  15. "partitioned_activations": [true|false],
  16. "number_checkpoints": 100,
  17. "contiguous_memory_optimization": [true|false],
  18. "cpu_checkpointing": [true|false]
  19. "profile": [true|false],
  20. "synchronize_checkpoint_boundary": [true|false],
  21. }
  22. }
  23. '''
  24. ACT_CHKPT_PARTITION_ACTIVATIONS = 'partition_activations'
  25. ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT = False
  26. ACT_CHKPT_NUMBER_CHECKPOINTS = 'number_checkpoints'
  27. ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT = None
  28. ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION = 'contiguous_memory_optimization'
  29. ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT = False
  30. ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY = 'synchronize_checkpoint_boundary'
  31. ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT = False
  32. ACT_CHKPT_PROFILE = 'profile'
  33. ACT_CHKPT_PROFILE_DEFAULT = False
  34. ACT_CHKPT_CPU_CHECKPOINTING = 'cpu_checkpointing'
  35. ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT = False
  36. ACT_CHKPT = 'activation_checkpointing'
  37. ACT_CHKPT_DEFAULT = {
  38. ACT_CHKPT_PARTITION_ACTIVATIONS: ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT,
  39. ACT_CHKPT_NUMBER_CHECKPOINTS: ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT,
  40. ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION:
  41. ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT,
  42. ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY:
  43. ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT,
  44. ACT_CHKPT_PROFILE: ACT_CHKPT_PROFILE_DEFAULT,
  45. ACT_CHKPT_CPU_CHECKPOINTING: ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT
  46. }
  47. class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject):
  48. def __init__(self, param_dict):
  49. super(DeepSpeedActivationCheckpointingConfig, self).__init__()
  50. self.partition_activations = None
  51. self.contiguous_memory_optimization = None
  52. self.cpu_checkpointing = None
  53. self.number_checkpoints = None
  54. self.synchronize_checkpoint_boundary = None
  55. self.profile = None
  56. if ACT_CHKPT in param_dict.keys():
  57. act_chkpt_config_dict = param_dict[ACT_CHKPT]
  58. else:
  59. act_chkpt_config_dict = ACT_CHKPT_DEFAULT
  60. self._initialize(act_chkpt_config_dict)
  61. def _initialize(self, act_chkpt_config_dict):
  62. self.partition_activations = get_scalar_param(
  63. act_chkpt_config_dict,
  64. ACT_CHKPT_PARTITION_ACTIVATIONS,
  65. ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT)
  66. self.contiguous_memory_optimization = get_scalar_param(
  67. act_chkpt_config_dict,
  68. ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION,
  69. ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT)
  70. self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict,
  71. ACT_CHKPT_CPU_CHECKPOINTING,
  72. ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT)
  73. self.number_checkpoints = get_scalar_param(act_chkpt_config_dict,
  74. ACT_CHKPT_NUMBER_CHECKPOINTS,
  75. ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT)
  76. self.profile = get_scalar_param(act_chkpt_config_dict,
  77. ACT_CHKPT_PROFILE,
  78. ACT_CHKPT_PROFILE_DEFAULT)
  79. self.synchronize_checkpoint_boundary = get_scalar_param(
  80. act_chkpt_config_dict,
  81. ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY,
  82. ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT)