# test_pld.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import numpy as np
import pytest

import deepspeed
from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
from unit.common import DistributedTest
from unit.simple_model import SimpleModel, PLD_SimpleModel, random_dataloader
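

# Unit check of the PLD schedule in isolation: ProgressiveLayerDrop anneals the
# keep probability as theta(t) = (1 - theta) * exp(-gamma * t) + theta, decaying
# from 1.0 at step 0 toward the configured lower bound `theta`.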
@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
def test_pld_schedule(tmpdir, theta):
    gamma = 0.001

    pld_scheduler = ProgressiveLayerDrop(theta, gamma)
    for i in range(10):
        pld_scheduler.update_state(i)
        expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
        actual_theta = pld_scheduler.get_theta()
        assert expected_theta == actual_theta
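

# End-to-end check: train a PLD-enabled model through the DeepSpeed engine and
# verify that the theta reported by the engine tracks the analytic schedule at
# every step.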
@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
class TestPLDModel(DistributedTest):
    world_size = 1

    def test_pld_model(self, theta):
        gamma = 0.001
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.0001
                }
            },
            "fp16": {
                "enabled": True
            },
            "progressive_layer_drop": {
                "enabled": True,
                "theta": theta,
                "gamma": gamma
            }
        }
        hidden_dim = 10

        model = PLD_SimpleModel(hidden_dim, empty_grad=False)
        model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device)

        for i, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

            expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
            actual_theta = model.get_pld_theta()
            assert expected_theta == actual_theta
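

# Negative check: with progressive_layer_drop enabled in the config, the engine
# passes PLD state as extra keyword arguments into the model's forward pass. A
# plain SimpleModel does not accept them, so the forward call should raise
# TypeError.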
class TestNonPLDModel(DistributedTest):
    world_size = 1

    def test_non_pld_model(self):
        gamma = 0.001
        theta = 0.5
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.0001
                }
            },
            "fp16": {
                "enabled": True
            },
            "progressive_layer_drop": {
                "enabled": True,
                "theta": theta,
                "gamma": gamma
            }
        }
        hidden_dim = 10

        model = SimpleModel(hidden_dim, empty_grad=False)
        model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model, total_samples=1, hidden_dim=hidden_dim, device=model.device)

        for i, batch in enumerate(data_loader):
            with pytest.raises(TypeError):
                loss = model(batch[0], batch[1])