# test_fp16.py
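"""Unit tests for DeepSpeed FP16 training.

Covers LAMB/Adam/AdamW optimizers under FP16 and FP32, static and dynamic loss
scaling, apex AMP (O1/O2), ZeRO stages 1-3 with and without CPU offload, and
gradient-norm consistency checks for MoE (expert-parallel) models.
"""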

import torch
import deepspeed.comm as dist
import deepspeed
import pytest
from deepspeed.ops.adam import FusedAdam
from .common import distributed_test
from deepspeed.ops.op_builder import CPUAdamBuilder
from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args, SimpleMoEModel, sequence_dataloader
from .util import required_torch_version

try:
    from apex import amp  # noqa: F401
    _amp_available = True
except ImportError:
    _amp_available = False
amp_available = pytest.mark.skipif(not _amp_available,
                                   reason="apex/amp is not installed")


def test_lamb_fp32_grad_clip(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1, 2])
    def _test_lamb_fp32_grad_clip(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.float)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim)


def test_lamb_fp16_basic(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1, 2])
    def _test_lamb_fp16_basic(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)


def test_lamb_fp16_empty_grad(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[2])
    def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)


def test_adam_fp32_empty_grad(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": False
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[2])
    def _test_adam_fp32_empty_grad(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.float)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_fp32_empty_grad(args=args, model=model, hidden_dim=hidden_dim)


def test_adamw_fp16_basic(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adamw_fp16_basic(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
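

# The next three tests target gradient-norm handling for MoE (expert-parallel)
# models: each monkeypatches the optimizer's `unscale_and_clip_grads` so that
# every rank all-gathers the total_norm it computed and asserts that the value
# agrees across the data-parallel group.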
def test_unfused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch):
    if not required_torch_version():
        pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    def mock_unscale_and_clip_grads(total_norm, apply_scale=True):
        torch_norm_tensor = torch.cuda.FloatTensor([total_norm])
        all_gather_results = [
            torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())
        ]
        dist.all_gather(all_gather_results, torch_norm_tensor)
        assert len(set([x.item() for x in all_gather_results])) == 1
        return 1.0

    @distributed_test(world_size=[2])
    def _test_unfused_fp16_optimizer(args, hidden_dim):
        # initialize MoE
        model = SimpleMoEModel(hidden_dim, ep_size=2)
        optimizer = torch.optim.AdamW(params=model.parameters())
        engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                       model=model,
                                                       optimizer=optimizer,
                                                       dist_init_required=False)
        monkeypatch.setattr(optimizer,
                            'unscale_and_clip_grads',
                            mock_unscale_and_clip_grads)
        data_loader = sequence_dataloader(model=engine,
                                          total_samples=50,
                                          hidden_dim=hidden_dim,
                                          device=engine.device)
        for n, batch in enumerate(data_loader):
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_unfused_fp16_optimizer(args=args, hidden_dim=hidden_dim)


def test_fused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch):
    if not required_torch_version():
        pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True):
        torch_norm_tensor = torch.cuda.FloatTensor([total_norm])
        all_gather_results = [
            torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())
        ]
        dist.all_gather(all_gather_results, torch_norm_tensor)
        assert len(set([x.item() for x in all_gather_results])) == 1
        return 1.0

    @distributed_test(world_size=[2])
    def _test_fused_fp16_optimizer(args, hidden_dim):
        # initialize MoE
        model = SimpleMoEModel(hidden_dim, ep_size=2)
        # optimizer = torch.optim.AdamW(params=model.parameters())
        optimizer = FusedAdam(params=model.parameters())
        engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                       model=model,
                                                       optimizer=optimizer,
                                                       dist_init_required=False)
        monkeypatch.setattr(optimizer,
                            'unscale_and_clip_grads',
                            mock_unscale_and_clip_grads)
        data_loader = sequence_dataloader(model=engine,
                                          total_samples=50,
                                          hidden_dim=hidden_dim,
                                          device=engine.device)
        for n, batch in enumerate(data_loader):
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_fused_fp16_optimizer(args=args, hidden_dim=hidden_dim)


@pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)])
def test_lamb_optimizer_gradnorm_for_moe(tmpdir, monkeypatch, fused_lamb_legacy: bool):
    if not required_torch_version():
        pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        },
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    def mock_unscale_and_clip_grads(total_norm, apply_scale=True):
        torch_norm_tensor = torch.cuda.FloatTensor([total_norm])
        all_gather_results = [
            torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())
        ]
        dist.all_gather(all_gather_results, torch_norm_tensor)
        assert len(set([x.item() for x in all_gather_results])) == 1
        return 1.0

    @distributed_test(world_size=[2])
    def _test_lamb_legacy_optimizer_step(args, hidden_dim, fused_lamb_legacy):
        # initialize MoE
        model = SimpleMoEModel(hidden_dim, ep_size=2)
        engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                       model=model,
                                                       model_parameters=model.parameters(),
                                                       dist_init_required=False)
        monkeypatch.setattr(optimizer,
                            'unscale_and_clip_grads',
                            mock_unscale_and_clip_grads)
        optimizer.fused_lamb_legacy = fused_lamb_legacy
        data_loader = sequence_dataloader(model=engine,
                                          total_samples=50,
                                          hidden_dim=hidden_dim,
                                          device=engine.device)
        for n, batch in enumerate(data_loader):
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_lamb_legacy_optimizer_step(args=args,
                                     hidden_dim=hidden_dim,
                                     fused_lamb_legacy=fused_lamb_legacy)


def test_dict_config_adamw_fp16_basic():
    config = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}}
    args = create_deepspeed_args()
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adamw_fp16_basic(args, model, hidden_dim, config):
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer,
                                              config=config)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim, config=config)


def test_adamw_fp16_empty_grad(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)


@pytest.mark.parametrize('zero_stage, use_cpu_offload',
                         [(1, False),
                          (2, False),
                          (2, True),
                          (3, False),
                          (3, True)])
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 16000,
                "cycle_first_stair_count": 8000,
                "decay_step_size": 16000,
                "cycle_min_lr": 1e-06,
                "cycle_max_lr": 3e-05,
                "decay_lr_rate": 1e-07,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    @distributed_test(world_size=[1])
    def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim):
        model = SimpleModel(hidden_dim)
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_fp16_zero_onecycle_compatibility(args=args,
                                                zero_stage=zero_stage,
                                                hidden_dim=hidden_dim)


@pytest.mark.parametrize('zero_stage, use_cpu_offload',
                         [(1, False),
                          (2, False),
                          (2, True),
                          (3, False),
                          (3, True)])
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 138.
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=2)
    def _test_zero_static_scale(args, zero_stage, hidden_dim):
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        # Ensure the static scaler is configured.
        assert optim.dynamic_loss_scale == False
        assert optim.loss_scaler.loss_scale == 138.

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    # test when hidden_dim is not divisible by (i.e. not aligned with) the DP world size
    _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9)
    # test when hidden_dim is aligned with the DP world size
    _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=10)


def test_zero_static_scale_deprecated_format(tmpdir):
    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 138.
        },
        "zero_optimization": {
            "stage": 1
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=2)
    def _test_zero_static_scale(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        # Ensure the static scaler is configured.
        assert optim.dynamic_loss_scale == False
        assert optim.loss_scaler.loss_scale == 138.

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_zero_static_scale(args)


@pytest.mark.parametrize('zero_stage, use_cpu_offload',
                         [(1, False),
                          (2, False),
                          (2, True),
                          (3, False),
                          (3, True)])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        },
        "zero_allow_untested_optimizer": False
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_zero_allow_untested_optimizer(args, zero_stage):
        hidden_dim = 10
        model = SimpleModel(hidden_dim)
        optimizer = SimpleOptimizer(model.parameters())
        with pytest.raises(AssertionError):
            model, optim, _, _ = deepspeed.initialize(args=args,
                                                      model=model,
                                                      optimizer=optimizer,
                                                      model_parameters=model.parameters())

    _test_zero_allow_untested_optimizer(args, zero_stage)


@pytest.mark.parametrize('zero_stage, use_cpu_offload',
                         [(1, False),
                          (2, False),
                          (2, True),
                          (3, False),
                          (3, True)])
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")
    if zero_stage == 3:
        pytest.skip("skip for now")

    config_dict = {
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,
        "fp16": {
            "enabled": True,
            "initial_scale_power": 8
        },
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload,
            "reduce_bucket_size": 100,
            "allgather_bucket_size": 100
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[3])
    def _test_zero_empty_partition(args, zero_stage):
        hidden_dim = 1
        model = SimpleModel(hidden_dim)

        # Ensure model has 2 parameters, to cause empty partition with DP=3
        assert len(list(model.parameters())) == 2
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=1,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_zero_empty_partition(args=args, zero_stage=zero_stage)
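

# The AMP tests below rely on NVIDIA apex; the `amp_available` marker defined
# at the top of this file skips them when apex is not installed.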
@amp_available
def test_adam_amp_basic(tmpdir):
    config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}}
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adam_amp_basic(args, model, hidden_dim):
        optimizer = torch.optim.Adam(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)


@amp_available
def test_lamb_amp_basic(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "amp": {
            "enabled": True,
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1, 2])
    def _test_lamb_amp_basic(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim)


@amp_available
def test_adam_amp_o2(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "amp": {
            "enabled": True,
            "opt_level": "O2"
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1, 2])
    def _test_adam_amp_o2(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim)


@amp_available
def test_adam_amp_o2_empty_grad(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "amp": {
            "enabled": True,
            "opt_level": "O2"
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_amp_o2_empty_grad(args=args, model=model, hidden_dim=hidden_dim)


@pytest.mark.parametrize('zero_stage, optimizer_constructor',
                         [(1, FusedAdam),
                          (2, torch.optim.Adam),
                          (2, FusedAdam),
                          (3, torch.optim.Adam),
                          (3, FusedAdam)])
def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    @distributed_test(world_size=[1])
    def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
        model = SimpleModel(hidden_dim)
        client_optimizer = optimizer_constructor(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=client_optimizer)

    _test_zero_supported_client_optimizer(args=args,
                                          zero_stage=zero_stage,
                                          optimizer_constructor=optimizer_constructor)


def test_zero2_reduce_scatter_off(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 2,
            "contiguous_gradients": True,
            "allgather_bucket_size": 2000000000,
            "reduce_bucket_size": 200000000,
            "overlap_comm": False,
            "reduce_scatter": False
        },
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _helper(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _helper(args=args, model=model, hidden_dim=hidden_dim)


@pytest.mark.parametrize('adam_type, torch_impl',
                         [('Adam', True),
                          ('Adam', False),
                          ('AdamW', True),
                          ('AdamW', False)])
def test_fp16_adam_types(tmpdir, adam_type, torch_impl):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
            "initial_scale_power": 10
        },
        "optimizer": {
            "type": adam_type,
            "torch_adam": torch_impl,
            "params": {
                "lr": 0.00015
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_fp16_adam_types(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for _, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim)


def test_zero3_lazyscatter(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
            "initial_scale_power": 10
        },
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": 0.00015
            }
        },
        "zero_optimization": {
            "stage": 3
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    @distributed_test(world_size=[1])
    def _go(args):
        model = SimpleModel(hidden_dim)
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for _, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _go(args=args)


@pytest.mark.parametrize('stage', [1, 2, 3])
def test_zero_empty_grad(tmpdir, stage):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": stage
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _go(args, model, hidden_dim):
        optimizer = torch.optim.Adam(model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _go(args=args, model=model, hidden_dim=hidden_dim)