run_func_test.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  # coding=utf-8
  # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
  #
  # Note: copy the webtext data into the "Megatron-LM" folder before running this script.
  5. import unittest
  6. import subprocess
  7. import os
  8. import time
  9. import re
  10. from .test_common import BaseTestCase
  # Model dimensions shared by every test configuration in this module.
  LAYERS = 2
  HIDDEN_SIZE = 128
  ATTN_HEADS = 8
  SEQ_LEN = 64

  # NOTE(review): not referenced anywhere in this file -- presumably consumed
  # by the launcher or another module; confirm before removing.
  MASTER_PORT = 29700
  def grep_loss_from_file(file_name):
      """Return the end-of-training test-data LM loss recorded in a log file.

      Only lines containing the end-of-training validation summary are
      considered; the number following ``LM loss:`` is parsed from each of
      them and the value from the last such line wins.

      Args:
          file_name: Path of the log file to scan.

      Returns:
          The parsed loss as a float, or ``0.0`` when no summary line with a
          parsable number exists (a notice is printed in that case). Callers
          in this file treat ``0.0`` as "no loss found".

      Raises:
          OSError: If the file cannot be opened.
      """
      line_filter = "validation loss at the end of training for test data | LM loss:"
      # Raw string: "\." inside a normal literal is an invalid escape sequence.
      match_number = re.compile(r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

      loss = 0.0
      print(f'grepping {file_name}')
      with open(file_name, 'r') as f:
          # Iterate the file lazily instead of loading every line up front.
          for line in f:
              if line_filter not in line:
                  continue
              found = match_number.search(line)
              # A summary line without a parsable number (e.g. "LM loss: nan")
              # is skipped rather than raising IndexError.
              if found:
                  loss = float(found.group(1))
      if loss == 0.0:
          print("no loss found in file ", file_name)
      return loss
  class GPT2FuncTestCase(BaseTestCase):
      """Loss-parity tests comparing a baseline run against a DeepSpeed run.

      Every test builds a configuration dict, performs (or reuses) a baseline
      run, performs a DeepSpeed run, and passes when the final LM losses read
      from the two log files agree within a relative tolerance.

      ``gen_output_name`` and ``run_gpt2_test`` are inherited from
      ``BaseTestCase``, which is defined outside this file.
      """

      def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
          # NOTE(review): the default is a description rather than a method
          # name; every construction in this file passes an explicit test name.
          super(GPT2FuncTestCase, self).__init__(methodName)

      def setUp(self):
          """Remember the current directory and switch to this file's directory."""
          self.save_dir = os.getcwd()
          new_dir = os.path.dirname(__file__)
          # dirname is empty when the module path has no directory component.
          if new_dir:
              os.chdir(new_dir)

      def tearDown(self):
          """Restore the working directory saved by ``setUp``."""
          os.chdir(self.save_dir)

      # ------------------------------------------------------------------
      # Private helpers
      # ------------------------------------------------------------------

      @staticmethod
      def _make_config(mp, gpus, bs, ds_json, steps=1000, deepspeed=False, **extra):
          """Build a test configuration dict.

          Keys are inserted in a fixed order (mp, gpus, nodes, bs, steps,
          layers, hidden_size, seq_length, heads, deepspeed, json), followed
          by ``extra`` in the order given.

          Args:
              mp: Value stored under ``"mp"``.
              gpus: Value stored under ``"gpus"``.
              bs: Value stored under ``"bs"``.
              ds_json: DeepSpeed JSON config file name, stored under ``"json"``.
              steps: Value stored under ``"steps"``.
              deepspeed: Initial value of the ``"deepspeed"`` flag.
              **extra: Optional keys such as ``cpu_optimizer``,
                  ``test_torch_offload`` or ``baseline``.

          Returns:
              A new dict; ``"nodes"`` is always 1.
          """
          config = {
              "mp": mp,
              "gpus": gpus,
              "nodes": 1,
              "bs": bs,
              "steps": steps,
              "layers": LAYERS,
              "hidden_size": HIDDEN_SIZE,
              "seq_length": SEQ_LEN,
              "heads": ATTN_HEADS,
              "deepspeed": deepspeed,
              "json": ds_json,
          }
          config.update(extra)
          return config

      def _assert_parity(self, test_config, r_tol, partition_activations=False):
          """Assert loss parity for the basic run and, optionally, the
          partition-activations run of the same configuration."""
          self.assertTrue(self.run_test(test_config, r_tol),
                          "loss parity check failed")
          if partition_activations:
              self.assertTrue(
                  self.run_partition_activations_test(test_config, r_tol),
                  "loss parity check failed with partitioned activations")

      # ------------------------------------------------------------------
      # fp16 / ZeRO stage 1
      # ------------------------------------------------------------------

      def test_mp1_gpu2_node1_fp16(self):
          test_config = self._make_config(1, 2, 8, "ds_config_func_bs8_no_zero.json")
          self._assert_parity(test_config, 0.01)

      def test_mp1_gpu1_node1_zero1(self):
          test_config = self._make_config(1, 1, 4, "ds_config_func_bs4_zero1.json")
          self._assert_parity(test_config, 0.01)

      def test_mp1_gpu2_node1_zero1(self):
          test_config = self._make_config(1, 2, 8, "ds_config_func_bs8_zero1.json")
          self._assert_parity(test_config, 0.01)

      def test_mp2_gpu4_node1_zero1(self):
          test_config = self._make_config(2, 4, 8, "ds_config_func_bs8_zero1.json")
          self._assert_parity(test_config, 0.01)

      def test_mp4_gpu4_node1_zero1(self):
          test_config = self._make_config(4, 4, 8, "ds_config_func_bs8_zero1.json")
          self._assert_parity(test_config, 0.01)

      # ------------------------------------------------------------------
      # ZeRO stage 2
      # ------------------------------------------------------------------

      def test_mp1_gpu1_node1_zero2(self):
          test_config = self._make_config(1, 1, 4, "ds_config_func_bs4_zero2.json")
          self._assert_parity(test_config, 0.01)

      def test_mp1_gpu2_node1_zero2(self):
          test_config = self._make_config(1, 2, 8, "ds_config_func_bs8_zero2.json")
          self._assert_parity(test_config, 0.01)

      def test_mp2_gpu4_node1_zero2(self):
          test_config = self._make_config(2, 4, 8, "ds_config_func_bs8_zero2.json")
          self._assert_parity(test_config, 0.01, partition_activations=True)

      def test_mp4_gpu4_node1_zero2(self):
          test_config = self._make_config(4, 4, 8, "ds_config_func_bs8_zero2.json")
          self._assert_parity(test_config, 0.01, partition_activations=True)

      # ------------------------------------------------------------------
      # ZeRO stage 2 + offload, DeepSpeed CPU optimizer (looser tolerance)
      # ------------------------------------------------------------------

      def test_mp1_gpu1_node1_zero2_ds_offload(self):
          test_config = self._make_config(1, 1, 4,
                                          "ds_config_func_bs4_zero2_offload.json",
                                          cpu_optimizer=True)
          self._assert_parity(test_config, 0.02)

      def test_mp1_gpu2_node1_zero2_ds_offload(self):
          test_config = self._make_config(1, 2, 8,
                                          "ds_config_func_bs8_zero2_offload.json",
                                          cpu_optimizer=True)
          self._assert_parity(test_config, 0.02)

      def test_mp2_gpu4_node1_zero2_gas(self):
          # The baseline is itself a DeepSpeed run, using the "baseline" JSON.
          test_config = self._make_config(2, 4, 8,
                                          "ds_config_func_bs8_zero2_gas3.json",
                                          deepspeed=True,
                                          baseline="ds_config_func_bs8_zero0_gas3.json")
          self._assert_parity(test_config, 0.01, partition_activations=True)

      def test_mp2_gpu4_node1_zero2_ds_offload(self):
          test_config = self._make_config(2, 4, 8,
                                          "ds_config_func_bs8_zero2_offload.json",
                                          cpu_optimizer=True)
          self._assert_parity(test_config, 0.02, partition_activations=True)

      def test_mp4_gpu4_node1_zero2_ds_offload(self):
          test_config = self._make_config(4, 4, 8,
                                          "ds_config_func_bs8_zero2_offload.json",
                                          cpu_optimizer=True)
          self._assert_parity(test_config, 0.02, partition_activations=True)

      # ------------------------------------------------------------------
      # ZeRO stage 2 + offload, torch Adam on CPU
      # ------------------------------------------------------------------

      def test_mp1_gpu1_node1_zero2_torch_offload(self):
          test_config = self._make_config(1, 1, 4,
                                          "ds_config_func_bs4_zero2_offload.json",
                                          cpu_optimizer=True,
                                          test_torch_offload=True)
          self._assert_parity(test_config, 0.01)

      def test_mp1_gpu2_node1_zero2_torch_offload(self):
          test_config = self._make_config(1, 2, 8,
                                          "ds_config_func_bs8_zero2_offload.json",
                                          cpu_optimizer=True,
                                          test_torch_offload=True)
          self._assert_parity(test_config, 0.01)

      def test_mp2_gpu4_node1_zero2_torch_offload(self):
          test_config = self._make_config(2, 4, 8,
                                          "ds_config_func_bs8_zero2_offload.json",
                                          cpu_optimizer=True,
                                          test_torch_offload=True)
          self._assert_parity(test_config, 0.01, partition_activations=True)

      def test_mp4_gpu4_node1_zero2_torch_offload(self):
          test_config = self._make_config(4, 4, 8,
                                          "ds_config_func_bs8_zero2_offload.json",
                                          cpu_optimizer=True,
                                          test_torch_offload=True)
          # The partition-activations result is now asserted as well; it was
          # previously computed and then discarded.
          self._assert_parity(test_config, 0.01, partition_activations=True)

      def test_optimizer_scheduler(self):
          test_config = self._make_config(1, 1, 4,
                                          "ds_config_func_scheduler.json",
                                          steps=20)
          # Only assures the run does not crash; the parity verdict is
          # deliberately ignored.
          self.run_test(test_config, 0.01)

      # ------------------------------------------------------------------
      # Runners
      # ------------------------------------------------------------------

      def run_partition_activations_test(self, test_config, r_tol):
          """Compare a baseline run with a DeepSpeed run that passes
          ``--deepspeed-activation-checkpointing``.

          Args:
              test_config: Configuration dict; it is not modified.
              r_tol: Maximum allowed relative difference between the losses.

          Returns:
              True when both losses were found and agree within ``r_tol``.
          """
          return self._run_parity(
              test_config,
              r_tol,
              baseline_prefix="gpt2_func_",
              prefix="gpt2_partition_activation_",
              deepspeed_args="--deepspeed-activation-checkpointing ")

      def run_test(self, test_config, r_tol):
          """Compare a baseline run with a plain DeepSpeed run.

          Args:
              test_config: Configuration dict; it is not modified.
              r_tol: Maximum allowed relative difference between the losses.

          Returns:
              True when both losses were found and agree within ``r_tol``.
          """
          return self._run_parity(test_config,
                                  r_tol,
                                  baseline_prefix="gpt2_func",
                                  prefix="gpt2_func")

      def _run_parity(self, test_config, r_tol, baseline_prefix, prefix, deepspeed_args=""):
          """Shared baseline-vs-DeepSpeed procedure behind both runners.

          Args:
              test_config: Caller's configuration dict (copied, never mutated).
              r_tol: Relative tolerance handed to ``check_parity``.
              baseline_prefix: Output-name prefix for the baseline run; the
                  baseline JSON name minus its last five characters is
                  appended when a ``"baseline"`` config is supplied.
              prefix: Output-name prefix for the DeepSpeed run.
              deepspeed_args: Text placed before the CPU-optimizer flags in
                  the DeepSpeed run's ``other_args``.
          """
          print("\n")
          print("{0}: starting......".format(self.id()))

          # Work on a private copy so the changes below cannot leak into a
          # later run that reuses the caller's dict.
          config = dict(test_config)
          deepspeed_config = config["json"]
          cpu_optimizer_flag = self.gen_cpu_optimizer_flag(config, True)

          # Baseline run: turn DeepSpeed off unless a dedicated baseline
          # DeepSpeed config is provided.
          if "baseline" not in config:
              config["deepspeed"] = False
              baseline_deepspeed_config = False
          else:
              config["json"] = config["baseline"]
              baseline_prefix += config["json"][0:-5]
              baseline_deepspeed_config = True

          config["other_args"] = f"\"{cpu_optimizer_flag}\""
          base_file = self.gen_output_name(config,
                                           baseline_prefix,
                                           baseline_config=baseline_deepspeed_config)

          # Skip the baseline run if a log with a loss already exists.
          if not self.has_loss_data(base_file):
              print("{0}: baseline run.".format(self.id()))
              self.run_gpt2_test(config, base_file)
          else:
              print("{0}: baseline exists.".format(self.id()))

          # DeepSpeed run: switch back to the JSON under test. Without this
          # restore, a config with a "baseline" entry would compare the
          # baseline JSON against itself.
          config["deepspeed"] = True
          cpu_optimizer_flag = self.gen_cpu_optimizer_flag(config, False)
          config["other_args"] = f"\"{deepspeed_args}{cpu_optimizer_flag}\""
          config["json"] = deepspeed_config

          print("{0}: DeepSpeed run.".format(self.id()))
          test_file = self.gen_output_name(config, prefix)
          self.run_gpt2_test(config, test_file)

          return self.check_parity(base_file, test_file, r_tol)

      def has_loss_data(self, file_name):
          """Return True when ``file_name`` exists and a non-zero loss can be
          read from it."""
          if not os.path.exists(file_name):
              return False
          return grep_loss_from_file(file_name) != 0.0

      def check_parity(self, base_file, test_file, r_tol):
          """Compare the losses stored in two log files.

          Args:
              base_file: Log file of the baseline run.
              test_file: Log file of the DeepSpeed run.
              r_tol: Maximum allowed ``|base - test| / |base|``.

          Returns:
              False when either loss reads as 0.0 (not found) or the relative
              difference exceeds ``r_tol``; True otherwise.
          """
          base_loss = grep_loss_from_file(base_file)
          test_loss = grep_loss_from_file(test_file)

          print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))

          # 0.0 is the "no loss found" marker; rejecting it here also keeps
          # the division below safe.
          if base_loss == 0.0 or test_loss == 0.0:
              return False

          return abs((base_loss - test_loss) / base_loss) <= r_tol

      def gen_cpu_optimizer_flag(self, test_config, is_baseline):
          """Build the CPU-optimizer command-line flags for one run.

          Args:
              test_config: Configuration dict; ``cpu_optimizer`` and
                  ``test_torch_offload`` are read when present.
              is_baseline: True for the baseline run.

          Returns:
              ``""`` when ``cpu_optimizer`` is missing or falsy. Otherwise
              ``"--cpu-optimizer"``, with ``" --cpu_torch_adam"`` appended for
              the baseline run or when ``test_torch_offload`` is truthy.
          """
          if not test_config.get('cpu_optimizer'):
              return ""

          cpu_optimizer_flag = "--cpu-optimizer"
          if is_baseline or test_config.get('test_torch_offload'):
              cpu_optimizer_flag += " --cpu_torch_adam"
          return cpu_optimizer_flag
  def suite():
      """Assemble the functional tests into a suite, in a fixed order."""
      ordered_tests = (
          'test_mp1_gpu2_node1_fp16',

          # Baseline = Megatron + Torch.Optim.Adam
          # Test = Megatron + Torch.Optim.Adam + ZeRO-Offload
          'test_mp1_gpu1_node1_zero2_torch_offload',
          'test_mp1_gpu2_node1_zero2_torch_offload',
          'test_mp2_gpu4_node1_zero2_torch_offload',
          'test_mp4_gpu4_node1_zero2_torch_offload',

          # Baseline = Megatron + Torch.Optim.Adam
          # Test = Megatron + DeepSpeedAdam + ZeRO-Offload
          'test_mp1_gpu1_node1_zero2_ds_offload',
          'test_mp1_gpu2_node1_zero2_ds_offload',
          'test_mp2_gpu4_node1_zero2_ds_offload',
          'test_mp4_gpu4_node1_zero2_ds_offload',

          'test_mp1_gpu1_node1_zero1',
          'test_mp1_gpu2_node1_zero1',
          'test_mp2_gpu4_node1_zero1',
          'test_mp4_gpu4_node1_zero1',

          'test_mp1_gpu1_node1_zero2',
          'test_mp1_gpu2_node1_zero2',
          'test_mp2_gpu4_node1_zero2',
          'test_mp4_gpu4_node1_zero2',

          'test_mp2_gpu4_node1_zero2_gas',

          'test_optimizer_scheduler',
      )

      collected = unittest.TestSuite()
      for test_name in ordered_tests:
          collected.addTest(GPT2FuncTestCase(test_name))
      return collected
  if __name__ == '__main__':
      # failfast=True stops the run at the first failure or error.
      unittest.TextTestRunner(failfast=True).run(suite())