# test_pld.py

import numpy as np
import deepspeed
import pytest
from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
from .common import distributed_test
from .simple_model import SimpleModel, PLD_SimpleModel, random_dataloader, args_from_dict
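
# Overview: test_pld_schedule checks the standalone ProgressiveLayerDrop
# schedule; test_pld_model trains a PLD-aware model through the DeepSpeed
# engine and tracks the reported theta; test_non_pld_model runs a plain
# model with PLD enabled in the config and expects the forward pass to fail.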

@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
def test_pld_schedule(tmpdir, theta):
    gamma = 0.001

    pld_scheduler = ProgressiveLayerDrop(theta, gamma)
    for i in range(10):
        pld_scheduler.update_state(i)
        expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
        actual_theta = pld_scheduler.get_theta()
        assert expected_theta == actual_theta
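
# Endpoints of the closed form above: at i == 0 the keep probability is
# (1 - theta) * 1 + theta == 1.0 (no layers dropped); as i grows it decays
# exponentially at rate gamma toward the configured floor `theta`.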

@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
def test_pld_model(tmpdir, theta):
    gamma = 0.001
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": 'Adam',
            "params": {
                "lr": 0.0001
            }
        },
        "fp16": {
            "enabled": True
        },
        "progressive_layer_drop": {
            "enabled": True,
            "theta": theta,
            "gamma": gamma
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = PLD_SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_pld_model(args, model, hidden_dim, theta, gamma):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for i, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

            expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
            actual_theta = model.get_pld_theta()
            assert expected_theta == actual_theta

    _test_pld_model(args=args,
                    model=model,
                    hidden_dim=hidden_dim,
                    theta=theta,
                    gamma=gamma)
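
# The engine returned by deepspeed.initialize exposes get_pld_theta(), and
# the test above expects it to follow the same closed-form schedule as the
# standalone scheduler, indexed by the training step.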

def test_non_pld_model(tmpdir):
    gamma = 0.001
    theta = 0.5
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": 'Adam',
            "params": {
                "lr": 0.0001
            }
        },
        "fp16": {
            "enabled": True
        },
        "progressive_layer_drop": {
            "enabled": True,
            "theta": theta,
            "gamma": gamma
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_non_pld_model(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=1,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for i, batch in enumerate(data_loader):
            with pytest.raises(TypeError):
                loss = model(batch[0], batch[1])

    _test_non_pld_model(args=args, model=model, hidden_dim=hidden_dim)
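
# The TypeError assumption here: with progressive_layer_drop enabled, the
# engine's forward pass injects PLD state (e.g. the current theta) as extra
# keyword arguments, which the plain SimpleModel.forward does not accept;
# the exact kwargs are an implementation detail of the DeepSpeed engine.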