# offload_config.py
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from enum import Enum
from pathlib import Path
from typing import Optional

from deepspeed.pydantic_v1 import Field, validator
from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
  8. class OffloadDeviceEnum(str, Enum):
  9. """ Enum for valid offload devices """
  10. none = "none"
  11. cpu = "cpu"
  12. nvme = "nvme"
  13. class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel):
  14. """ Set options for parameter offload. Valid only with stage 3. """
  15. device: OffloadDeviceEnum = "none"
  16. """
  17. Device memory to offload model parameters. Supported options are `cpu` and
  18. `nvme`.
  19. """
  20. nvme_path: Path = None
  21. """ Filesystem path for NVMe device for parameter offloading. """
  22. buffer_count: int = Field(5, ge=0)
  23. """ Number of buffers in buffer pool for parameter offloading to NVMe. """
  24. buffer_size: int = Field(pp_int(1e8), ge=0)
  25. """ Size of buffers in buffer pool for parameter offloading to NVMe. """
  26. max_in_cpu: int = Field(pp_int(1e9), ge=0)
  27. """
  28. Number of parameter elements to maintain in CPU memory when offloading to
  29. NVMe is enabled.
  30. """
  31. pin_memory: bool = False
  32. """
  33. Offload to page-locked CPU memory. This could boost throughput at the cost
  34. of extra memory overhead.
  35. """
  36. class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
  37. """ Set options for optimizer offload. Valid with stage 1, 2, and 3. """
  38. device: OffloadDeviceEnum = "none"
  39. """
  40. Device memory to offload optimizer state. Supported options are `cpu` and
  41. `nvme`. Optimizer computation is offload to CPU regardless of device option.
  42. """
  43. nvme_path: Path = None
  44. """ Filesystem path for NVMe device for optimizer state offloading. """
  45. buffer_count: int = Field(4, ge=0)
  46. """
  47. Number of buffers in buffer pool for optimizer state offloading to NVMe.
  48. This should be at least the number of states maintained per parameter by
  49. the optimizer. For example, Adam optimizer has 4 states (parameter,
  50. gradient, momentum, and variance).
  51. """
  52. pin_memory: bool = False
  53. """
  54. Offload to page-locked CPU memory. This could boost throughput at the cost
  55. of extra memory overhead.
  56. """
  57. pipeline_read: bool = False
  58. """
  59. For tile-based optimizer step processing, overlap read of next tile with
  60. computation of current tile. Used in ZeRO-Infinity.
  61. """
  62. pipeline_write: bool = False
  63. """
  64. For tile-based optimizer step processing, overlap write of previous tile
  65. with computation of current tile.
  66. """
  67. fast_init: bool = False
  68. """ Enable fast optimizer initialization when offloading to NVMe. """
  69. @validator("pipeline_read", "pipeline_write", always=True)
  70. def set_pipeline(cls, field_value, values):
  71. values["pipeline"] = field_value or values.get("pipeline", False)
  72. return field_value
  73. ratio: float = Field(1.0, ge=0.0, le=1.0)
  74. """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3."""