test_lr_scheduler.py
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest

import deepspeed
from deepspeed.ops.op_builder import CPUAdamBuilder
from unit.common import DistributedTest
from unit.simple_model import *
from unit.checkpoint.common import checkpoint_correctness_verification


@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(0, False), (1, False), (2, False), (2, True), (3, False),
                                                         (3, True)])
class TestLRSchedulerCheckpoint(DistributedTest):
    world_size = 2

    def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
        # Checkpoint with an LR scheduler configured and verify that its state
        # (the WarmupLR step count) is restored on load.
        if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.00015,
                    "betas": [0.8, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7
                }
            },
            "fp16": {
                "enabled": True
            },
            "zero_optimization": {
                "stage": zero_stage,
                "cpu_offload": use_cpu_offload
            },
            "scheduler": {
                "type": "WarmupLR",
                "params": {
                    "warmup_min_lr": 0,
                    "warmup_max_lr": 0.001,
                    "warmup_num_steps": 1000
                }
            }
        }
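        # WarmupLR ramps the learning rate from warmup_min_lr to warmup_max_lr
        # over warmup_num_steps; the scheduler's step count is the state that
        # must survive the save/load round trip below.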
        hidden_dim = 10

        if zero_stage == 3:
            global DeepSpeedZeroOptimizer_Stage3
            from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
            # ZeRO stage 3 partitions parameters at construction time, so the
            # models must be built under deepspeed.zero.Init().
            with deepspeed.zero.Init():
                models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        else:
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        checkpoint_correctness_verification(config_dict,
                                            models,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=False,
                                            load_lr_scheduler_states=True)

    def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
        # Checkpoint with the same scheduler config but reload with
        # load_lr_scheduler_states=False, so the warmup restarts from scratch.
        if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 1e-5
                }
            },
            "fp16": {
                "enabled": True
            },
            "zero_optimization": {
                "stage": zero_stage,
                "cpu_offload": use_cpu_offload
            },
            "scheduler": {
                "type": "WarmupLR",
                "params": {
                    "warmup_min_lr": 0,
                    "warmup_max_lr": 0.001,
                    "warmup_num_steps": 1000
                }
            },
        }
        hidden_dim = 10

        if zero_stage == 3:
            with deepspeed.zero.Init():
                models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        else:
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        checkpoint_correctness_verification(config_dict,
                                            models,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=False,
                                            load_lr_scheduler_states=False)
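
# For reference, a minimal sketch (not part of the test suite) of the
# save/load round trip that checkpoint_correctness_verification exercises
# above. It uses the public DeepSpeed checkpoint API but is not the helper's
# actual implementation; _roundtrip_sketch is a hypothetical name, and
# SimpleModel and config_dict are the fixtures used in this file.
def _roundtrip_sketch(config_dict, save_dir, hidden_dim=10):
    model = SimpleModel(hidden_dim, empty_grad=False)
    engine, _, _, lr_scheduler = deepspeed.initialize(model=model,
                                                      model_parameters=model.parameters(),
                                                      config=config_dict)
    # Persist engine state, including the WarmupLR scheduler's step count.
    engine.save_checkpoint(save_dir)
    # load_lr_scheduler_states=True restores the scheduler, so the learning
    # rate resumes from the saved warmup step; with False it restarts from
    # warmup_min_lr, which is the behavior test_checkpoint_no_lr_scheduler
    # relies on.
    engine.load_checkpoint(save_dir, load_lr_scheduler_states=True)
    return lr_scheduler.get_lr()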