test_dynamic_loss_scale.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import torch
import deepspeed
import numpy as np
from unit.common import DistributedTest
from unit.simple_model import SimpleModel


def run_model_step(model, gradient_list):
    """Fill every parameter's gradient with each value in turn and take an engine step."""
    for value in gradient_list:
        for p in model.parameters():
            p.grad = torch.empty_like(p, dtype=p.dtype)
            p.grad.fill_(value)
        model.step()
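

# The tests below exercise DeepSpeed's dynamic loss scaling: the scale is halved
# whenever a step sees inf/nan gradients and doubled after `loss_scale_window`
# consecutive overflow-free steps. The helper below is a minimal sketch of that
# arithmetic for reference only; `_expected_scale` is illustrative, is not used
# by the tests, and is not a DeepSpeed API.
def _expected_scale(initial_scale_power, scale_window, overflow_flags, min_scale=1):
    """Return the loss scale expected after a sequence of steps.

    overflow_flags: iterable of bools, True if that step overflowed.
    """
    scale = 2.0**initial_scale_power
    clean_steps = 0
    for overflowed in overflow_flags:
        if overflowed:
            # Overflow: halve the scale (never below min_scale) and restart the window.
            scale = max(scale / 2, min_scale)
            clean_steps = 0
        else:
            # Clean step: double the scale once per full window of clean steps.
            clean_steps += 1
            if clean_steps % scale_window == 0:
                scale *= 2
    return scale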


class TestFused(DistributedTest):
    world_size = 1

    def test_no_overflow(self):
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 0.00015
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 8,
                "loss_scale_window": 2
            }
        }
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        expected_loss_scale = 2**8
        expected_scale_window = 2
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        # With no overflows, the scale should double every `scale_window` steps.
        for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)):
            run_model_step(model, [value])
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)
            if optim.cur_iter % expected_scale_window == 0:
                expected_loss_scale *= 2

    def test_all_overflow(self):
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 0.00015
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 4,
                "loss_scale_window": 2
            }
        }
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        expected_loss_scale = 2**4
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale

        # Every step overflows, so the scale should halve each step (floored at 1).
        overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
        for i, value in enumerate(overflow_gradients):
            run_model_step(model, [value])
            expected_loss_scale = max(expected_loss_scale / 2, 1)
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)

    def test_some_overflow(self):
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 0.00015
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 8,
                "loss_scale_window": 2
            }
        }
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        expected_loss_scale = 2**8
        expected_scale_window = 2
        expected_iteration = 0
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf'), float('nan')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model scale_window + 1 times to increase scale once
        normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1)
        expected_iteration += len(normal_gradients)
        run_model_step(model, normal_gradients)
        expected_loss_scale *= 2
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration
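

# TestUnfused mirrors TestFused but swaps Adam for Lamb, which (judging by the class
# names) exercises DeepSpeed's unfused fp16 optimizer path. The scaling policy under
# test is identical; the only other config difference is that test_all_overflow below
# also sets "min_loss_scale": 0.25, so repeated halving floors at 0.25 instead of 1.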


class TestUnfused(DistributedTest):
    world_size = 1

    def test_no_overflow(self):
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 8,
                "loss_scale_window": 2
            }
        }
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        expected_loss_scale = 2**8
        expected_scale_window = 2
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        # With no overflows, the scale should double every `scale_window` steps.
        for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)):
            run_model_step(model, [value])
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)
            if optim.cur_iter % expected_scale_window == 0:
                expected_loss_scale *= 2

    def test_all_overflow(self):
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 4,
                "loss_scale_window": 2,
                "min_loss_scale": 0.25
            }
        }
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        expected_loss_scale = 2**4
        expected_min_loss_scale = 0.25
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.min_loss_scale == expected_min_loss_scale

        # Every step overflows, so the scale should halve each step, floored at min_loss_scale.
        overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
        for i, value in enumerate(overflow_gradients):
            run_model_step(model, [value])
            expected_loss_scale = max(expected_loss_scale / 2, expected_min_loss_scale)
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)

    def test_some_overflow(self):
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 8,
                "loss_scale_window": 2
            }
        }
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        expected_loss_scale = 2**8
        expected_scale_window = 2
        expected_iteration = 0
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf'), float('nan')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model scale_window + 1 times to increase scale once
        normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1)
        expected_iteration += len(normal_gradients)
        run_model_step(model, normal_gradients)
        expected_loss_scale *= 2
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration
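

# Usage note: both test classes derive from DistributedTest with world_size = 1, so
# they run as single-process distributed tests under pytest, e.g. (exact path within
# the DeepSpeed test tree may differ):
#
#   pytest -v test_dynamic_loss_scale.py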