test_lr_scheduler.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import deepspeed
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.accelerator import get_accelerator

from unit.common import DistributedTest
from unit.simple_model import *
from unit.checkpoint.common import checkpoint_correctness_verification

import pytest


@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(0, False), (1, False), (2, False), (2, True), (3, False),
                                                         (3, True)])
class TestLRSchedulerCheckpoint(DistributedTest):
    world_size = 2

    def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
        # Save and reload a checkpoint with only the LR scheduler state restored,
        # and verify correctness across the parametrized ZeRO stages.
        if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")
        if get_accelerator().device_name() == 'cpu':
            pytest.skip("CPU accelerator does not support this test.")

        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.00015,
                    "betas": [0.8, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7
                }
            },
            "zero_optimization": {
                "stage": zero_stage,
                "cpu_offload": use_cpu_offload
            },
            "scheduler": {
                "type": "WarmupLR",
                "params": {
                    "warmup_min_lr": 0,
                    "warmup_max_lr": 0.001,
                    "warmup_num_steps": 1000
                }
            }
        }
        # Prefer fp16 when the accelerator supports it, otherwise fall back to bf16.
        if get_accelerator().is_fp16_supported():
            config_dict["fp16"] = {"enabled": True}
        elif get_accelerator().is_bf16_supported():
            config_dict["bf16"] = {"enabled": True}
        hidden_dim = 10

        if zero_stage == 3:
            global DeepSpeedZeroOptimizer_Stage3
            from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
            # ZeRO stage 3 partitions parameters at construction time, so the
            # models must be built under deepspeed.zero.Init().
            with deepspeed.zero.Init():
                models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        else:
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict,
                                            models,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=False,
                                            load_lr_scheduler_states=True)

    def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
        # Same save/reload round trip, but with the LR scheduler state
        # deliberately left unrestored from the checkpoint.
        if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")
        if get_accelerator().device_name() == 'cpu':
            pytest.skip("CPU accelerator does not support this test.")

        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 1e-5
                }
            },
            "zero_optimization": {
                "stage": zero_stage,
                "cpu_offload": use_cpu_offload
            },
            "scheduler": {
                "type": "WarmupLR",
                "params": {
                    "warmup_min_lr": 0,
                    "warmup_max_lr": 0.001,
                    "warmup_num_steps": 1000
                }
            },
        }
        # Prefer fp16 when the accelerator supports it, otherwise fall back to bf16.
        if get_accelerator().is_fp16_supported():
            config_dict["fp16"] = {"enabled": True}
        elif get_accelerator().is_bf16_supported():
            config_dict["bf16"] = {"enabled": True}
        hidden_dim = 10

        if zero_stage == 3:
            with deepspeed.zero.Init():
                models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        else:
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict,
                                            models,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=False,
                                            load_lr_scheduler_states=False)
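

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the upstream test file): a rough standalone
# approximation of the "WarmupLR" ramp configured in both tests above. It
# assumes DeepSpeed's default log-space warmup from warmup_min_lr to
# warmup_max_lr over warmup_num_steps; the function name and formula here are
# for illustration only, not the library implementation.
def _approx_warmup_lr(step, warmup_min_lr=0.0, warmup_max_lr=0.001, warmup_num_steps=1000):
    import math

    # After the warmup window the schedule holds the learning rate at warmup_max_lr.
    if step >= warmup_num_steps:
        return warmup_max_lr
    # Log-space interpolation between the minimum and maximum learning rates.
    gamma = math.log(step + 1) / math.log(warmup_num_steps)
    return warmup_min_lr + (warmup_max_lr - warmup_min_lr) * gamma
    # e.g. _approx_warmup_lr(0) -> 0.0 and _approx_warmup_lr(1000) -> 0.001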