# run_checkpoint_test.py
  1. # Copyright (c) Microsoft Corporation.
  2. # SPDX-License-Identifier: Apache-2.0
  3. # DeepSpeed Team
  4. """
  5. Note: please copy webtext data to "Megatron-LM" folder, before running this script.
  6. """
  7. import unittest
  8. import subprocess
  9. import os
  10. import re
  11. import shlex
  12. from .test_common import BaseTestCase
# Model-size settings reused by the test configurations in this file
# (passed under the "layers", "hidden_size" and "heads" keys respectively).
LAYERS = 2
HIDDEN_SIZE = 128
ATTN_HEADS = 8
  16. def remove_file(test_id, filename):
  17. cmd = shlex.split(f"if [ -f {filename} ] ; then rm -v {filename}; fi")
  18. print(f"{test_id} cmd: {cmd}")
  19. subprocess.run(cmd, check=False, executable='/bin/bash')
  20. def grep_loss_from_file(file_name):
  21. loss = 0.0
  22. with open(file_name, 'r') as f:
  23. lines = f.readlines()
  24. line_filter = "validation loss at the end of training for test data | LM loss:"
  25. match_number = re.compile(r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')
  26. for line in lines:
  27. if line_filter in line:
  28. loss = re.findall(match_number, line)
  29. loss = float(loss[0])
  30. if loss == 0.0:
  31. print("no loss found in file ", file_name)
  32. return loss
  33. class GPT2CheckpointTestCase(BaseTestCase):
  34. def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
  35. super(GPT2CheckpointTestCase, self).__init__(methodName)
  36. def setUp(self):
  37. self.save_dir = os.getcwd()
  38. new_dir = os.path.dirname(__file__)
  39. if new_dir:
  40. os.chdir(new_dir)
  41. def tearDown(self):
  42. os.chdir(self.save_dir)
  43. def test_mp2_gpu4_node1_with_zero1(self):
  44. test_config = {
  45. "mp": 2,
  46. "gpus": 4,
  47. "nodes": 1,
  48. "bs": 8,
  49. "steps": 1100,
  50. "layers": LAYERS,
  51. "hidden_size": HIDDEN_SIZE,
  52. "seq_length": 256,
  53. "heads": ATTN_HEADS,
  54. "deepspeed": True,
  55. "tag": "ds_zero1",
  56. "zero": True,
  57. "other_args": "",
  58. "checkpoint_name": "ckpt_mp2_gpu8_w_zero1",
  59. "checkpoint_interval": 1000,
  60. "json": "ds_config_func_bs8_zero1.json",
  61. }
  62. succ = self.run_test(test_config, 0.01)
  63. self.assertTrue(succ)
  64. def test_mp2_gpu4_node1_with_zero2(self):
  65. test_config = {
  66. "mp": 2,
  67. "gpus": 4,
  68. "nodes": 1,
  69. "bs": 8,
  70. "steps": 1100,
  71. "layers": LAYERS,
  72. "hidden_size": HIDDEN_SIZE,
  73. "seq_length": 256,
  74. "heads": ATTN_HEADS,
  75. "deepspeed": True,
  76. "tag": "ds_zero2",
  77. "zero": True,
  78. "other_args": "",
  79. "checkpoint_name": "ckpt_mp2_gpu8_w_zero2",
  80. "checkpoint_interval": 1000,
  81. "json": "ds_config_func_bs8_zero2.json",
  82. }
  83. succ = self.run_test(test_config, 0.01)
  84. self.assertTrue(succ)
  85. def test_mp2_gpu4_node1_with_zero2_offload(self):
  86. test_config = {
  87. "mp": 2,
  88. "gpus": 4,
  89. "nodes": 1,
  90. "bs": 8,
  91. "steps": 1100,
  92. "layers": LAYERS,
  93. "hidden_size": HIDDEN_SIZE,
  94. "seq_length": 256,
  95. "heads": ATTN_HEADS,
  96. "deepspeed": True,
  97. "tag": "ds_zero2_offload",
  98. "zero": True,
  99. "other_args": "",
  100. "checkpoint_name": "ckpt_mp2_gpu8_w_zero2_offload",
  101. "checkpoint_interval": 1000,
  102. "json": "ds_config_func_bs8_zero2_offload.json",
  103. "cpu_optimizer": True,
  104. }
  105. succ = self.run_test(test_config, 0.01)
  106. self.assertTrue(succ)
  107. def test_mp1_gpu2_load_gpu1_node1_with_zero1(self):
  108. test_config = {
  109. "mp": 1,
  110. "gpus": 2,
  111. "load_gpus": 1,
  112. "nodes": 1,
  113. "bs": 8,
  114. "steps": 1100,
  115. "layers": LAYERS,
  116. "hidden_size": HIDDEN_SIZE,
  117. "seq_length": 256,
  118. "heads": ATTN_HEADS,
  119. "deepspeed": True,
  120. "tag": "ds_zero1",
  121. "zero": True,
  122. "other_args": "",
  123. "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero1",
  124. "checkpoint_interval": 1000,
  125. "json": "ds_config_func_bs8_zero1.json",
  126. }
  127. succ = self.run_test(test_config, 0.01)
  128. self.assertTrue(succ)
  129. def test_mp1_gpu2_load_gpu4_node1_with_zero1(self):
  130. test_config = {
  131. "mp": 1,
  132. "gpus": 2,
  133. "load_gpus": 4,
  134. "nodes": 1,
  135. "bs": 8,
  136. "steps": 1100,
  137. "layers": LAYERS,
  138. "hidden_size": HIDDEN_SIZE,
  139. "seq_length": 256,
  140. "heads": ATTN_HEADS,
  141. "deepspeed": True,
  142. "tag": "ds_zero1",
  143. "zero": True,
  144. "other_args": "",
  145. "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero1",
  146. "checkpoint_interval": 1000,
  147. "json": "ds_config_func_bs8_zero1.json",
  148. }
  149. succ = self.run_test(test_config, 0.01)
  150. self.assertTrue(succ)
  151. def test_mp1_gpu2_load_gpu1_node1_with_zero2(self):
  152. test_config = {
  153. "mp": 1,
  154. "gpus": 2,
  155. "load_gpus": 1,
  156. "nodes": 1,
  157. "bs": 8,
  158. "steps": 1100,
  159. "layers": LAYERS,
  160. "hidden_size": HIDDEN_SIZE,
  161. "seq_length": 256,
  162. "heads": ATTN_HEADS,
  163. "deepspeed": True,
  164. "tag": "ds_zero2",
  165. "zero": True,
  166. "other_args": "",
  167. "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero2",
  168. "checkpoint_interval": 1000,
  169. "json": "ds_config_func_bs8_zero2.json",
  170. }
  171. succ = self.run_test(test_config, 0.01)
  172. self.assertTrue(succ)
  173. def test_mp1_gpu2_load_gpu1_node1_with_zero2_offload(self):
  174. test_config = {
  175. "mp": 1,
  176. "gpus": 2,
  177. "load_gpus": 1,
  178. "nodes": 1,
  179. "bs": 8,
  180. "steps": 1100,
  181. "layers": LAYERS,
  182. "hidden_size": HIDDEN_SIZE,
  183. "seq_length": 256,
  184. "heads": ATTN_HEADS,
  185. "deepspeed": True,
  186. "tag": "ds_zero2_offload",
  187. "zero": True,
  188. "other_args": "",
  189. "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero2_offload",
  190. "checkpoint_interval": 1000,
  191. "json": "ds_config_func_bs8_zero2_offload.json",
  192. "cpu_optimizer": True,
  193. }
  194. succ = self.run_test(test_config, 0.01)
  195. self.assertTrue(succ)
  196. def test_mp1_gpu2_load_gpu4_node1_with_zero2(self):
  197. test_config = {
  198. "mp": 1,
  199. "gpus": 2,
  200. "load_gpus": 4,
  201. "nodes": 1,
  202. "bs": 8,
  203. "steps": 1100,
  204. "layers": LAYERS,
  205. "hidden_size": HIDDEN_SIZE,
  206. "seq_length": 256,
  207. "heads": ATTN_HEADS,
  208. "deepspeed": True,
  209. "tag": "ds_zero2",
  210. "zero": True,
  211. "other_args": "",
  212. "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero2",
  213. "checkpoint_interval": 1000,
  214. "json": "ds_config_func_bs8_zero2.json",
  215. }
  216. succ = self.run_test(test_config, 0.01)
  217. self.assertTrue(succ)
  218. def test_mp1_gpu2_load_gpu4_node1_with_zero2_offload(self):
  219. test_config = {
  220. "mp": 1,
  221. "gpus": 2,
  222. "load_gpus": 4,
  223. "nodes": 1,
  224. "bs": 8,
  225. "steps": 1100,
  226. "layers": LAYERS,
  227. "hidden_size": HIDDEN_SIZE,
  228. "seq_length": 256,
  229. "heads": ATTN_HEADS,
  230. "deepspeed": True,
  231. "tag": "ds_zero2_offload",
  232. "zero": True,
  233. "other_args": "",
  234. "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero2_offload",
  235. "checkpoint_interval": 1000,
  236. "json": "ds_config_func_bs8_zero2_offload.json",
  237. "cpu_optimizer": True,
  238. }
  239. succ = self.run_test(test_config, 0.01)
  240. self.assertTrue(succ)
  241. def test_mp2_gpu4_load_gpu2_node1_with_zero1(self):
  242. test_config = {
  243. "mp": 2,
  244. "gpus": 4,
  245. "load_gpus": 2,
  246. "nodes": 1,
  247. "bs": 8,
  248. "steps": 1100,
  249. "layers": LAYERS,
  250. "hidden_size": HIDDEN_SIZE,
  251. "seq_length": 256,
  252. "heads": ATTN_HEADS,
  253. "deepspeed": True,
  254. "tag": "ds_zero1",
  255. "zero": True,
  256. "other_args": "",
  257. "checkpoint_name": "ckpt_mp2_gpu4_gpu2_w_zero1",
  258. "checkpoint_interval": 1000,
  259. "json": "ds_config_func_bs8_zero1.json",
  260. }
  261. succ = self.run_test(test_config, 0.01)
  262. self.assertTrue(succ)
  263. def test_mp2_gpu2_load_gpu4_node1_with_zero1(self):
  264. test_config = {
  265. "mp": 2,
  266. "gpus": 2,
  267. "load_gpus": 4,
  268. "nodes": 1,
  269. "bs": 8,
  270. "steps": 1100,
  271. "layers": LAYERS,
  272. "hidden_size": HIDDEN_SIZE,
  273. "seq_length": 256,
  274. "heads": ATTN_HEADS,
  275. "deepspeed": True,
  276. "tag": "ds_zero1",
  277. "zero": True,
  278. "other_args": "",
  279. "checkpoint_name": "ckpt_mp2_gpu2_gpu4_w_zero1",
  280. "checkpoint_interval": 1000,
  281. "json": "ds_config_func_bs8_zero1.json",
  282. }
  283. succ = self.run_test(test_config, 0.01)
  284. self.assertTrue(succ)
  285. def test_mp2_gpu4_load_gpu2_node1_with_zero2(self):
  286. test_config = {
  287. "mp": 2,
  288. "gpus": 4,
  289. "load_gpus": 2,
  290. "nodes": 1,
  291. "bs": 8,
  292. "steps": 1100,
  293. "layers": LAYERS,
  294. "hidden_size": HIDDEN_SIZE,
  295. "seq_length": 256,
  296. "heads": ATTN_HEADS,
  297. "deepspeed": True,
  298. "tag": "ds_zero2",
  299. "zero": True,
  300. "other_args": "",
  301. "checkpoint_name": "ckpt_mp2_gpu4_gpu2_w_zero2",
  302. "checkpoint_interval": 1000,
  303. "json": "ds_config_func_bs8_zero2.json",
  304. }
  305. succ = self.run_test(test_config, 0.01)
  306. self.assertTrue(succ)
  307. def test_mp2_gpu4_load_gpu2_node1_with_zero2_offload(self):
  308. test_config = {
  309. "mp": 2,
  310. "gpus": 4,
  311. "load_gpus": 2,
  312. "nodes": 1,
  313. "bs": 8,
  314. "steps": 1100,
  315. "layers": LAYERS,
  316. "hidden_size": HIDDEN_SIZE,
  317. "seq_length": 256,
  318. "heads": ATTN_HEADS,
  319. "deepspeed": True,
  320. "tag": "ds_zero2_offload",
  321. "zero": True,
  322. "other_args": "",
  323. "checkpoint_name": "ckpt_mp2_gpu4_gpu2_w_zero2_offload",
  324. "checkpoint_interval": 1000,
  325. "json": "ds_config_func_bs8_zero2_offload.json",
  326. "cpu_optimizer": True,
  327. }
  328. succ = self.run_test(test_config, 0.01)
  329. self.assertTrue(succ)
  330. def test_mp2_gpu2_load_gpu4_node1_with_zero2(self):
  331. test_config = {
  332. "mp": 2,
  333. "gpus": 2,
  334. "load_gpus": 4,
  335. "nodes": 1,
  336. "bs": 8,
  337. "steps": 1100,
  338. "layers": LAYERS,
  339. "hidden_size": HIDDEN_SIZE,
  340. "seq_length": 256,
  341. "heads": ATTN_HEADS,
  342. "deepspeed": True,
  343. "tag": "ds_zero2",
  344. "zero": True,
  345. "other_args": "",
  346. "checkpoint_name": "ckpt_mp2_gpu2_gpu4_w_zero2",
  347. "checkpoint_interval": 1000,
  348. "json": "ds_config_func_bs8_zero2.json",
  349. }
  350. succ = self.run_test(test_config, 0.01)
  351. self.assertTrue(succ)
  352. def test_mp2_gpu2_load_gpu4_node1_with_zero2_offload(self):
  353. test_config = {
  354. "mp": 2,
  355. "gpus": 2,
  356. "load_gpus": 4,
  357. "nodes": 1,
  358. "bs": 8,
  359. "steps": 1100,
  360. "layers": LAYERS,
  361. "hidden_size": HIDDEN_SIZE,
  362. "seq_length": 256,
  363. "heads": ATTN_HEADS,
  364. "deepspeed": True,
  365. "tag": "ds_zero2_offload",
  366. "zero": True,
  367. "other_args": "",
  368. "checkpoint_name": "ckpt_mp2_gpu2_gpu4_w_zero2_offload",
  369. "checkpoint_interval": 1000,
  370. "json": "ds_config_func_bs8_zero2_offload.json",
  371. "cpu_optimizer": True,
  372. }
  373. succ = self.run_test(test_config, 0.01)
  374. self.assertTrue(succ)
  375. def test_mp2_gpu4_node1_without_zero(self):
  376. test_config = {
  377. "mp": 2,
  378. "gpus": 4,
  379. "nodes": 1,
  380. "bs": 8,
  381. "steps": 1100,
  382. "layers": LAYERS,
  383. "hidden_size": HIDDEN_SIZE,
  384. "seq_length": 256,
  385. "heads": ATTN_HEADS,
  386. "deepspeed": True,
  387. "zero": False,
  388. "other_args": "",
  389. "tag": "ds_without_zero",
  390. "checkpoint_name": "ckpt_mp4_gpu16_wo_zero",
  391. "checkpoint_interval": 1000,
  392. "json": "ds_config_func_bs8_no_zero.json",
  393. }
  394. succ = self.run_test(test_config, 0.01)
  395. self.assertTrue(succ)
  396. def gen_name(self, test_config, prefix):
  397. save_dir = "checkpoint_test_logs"
  398. tag = test_config["tag"]
  399. checkpoint_name = test_config["checkpoint_name"]
  400. file_name = f"_{tag}_{checkpoint_name}.log"
  401. return os.path.join(save_dir, prefix + file_name)
  402. def run_test(self, test_config, r_tol):
  403. print("\n")
  404. print("{0}: starting......".format(self.id()))
  405. # Cache save and load gpu counts
  406. save_gpus = test_config["gpus"]
  407. if "load_gpus" in test_config:
  408. load_gpus = test_config["load_gpus"]
  409. del test_config["load_gpus"]
  410. else:
  411. load_gpus = test_config["gpus"]
  412. # save to current directory.
  413. checkpoint_folder = test_config["checkpoint_name"]
  414. checkpoint_interval = test_config["checkpoint_interval"]
  415. checkpoint_name = test_config["checkpoint_name"]
  416. #---------------remove old checkpoint---------------#
  417. try:
  418. cmd = shlex.split(f"rm -rf {checkpoint_name}")
  419. print(f"{self.id()} cmd: {cmd}")
  420. subprocess.run(cmd, check=False, executable='/bin/bash')
  421. except:
  422. print("No old checkpoint")
  423. if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]:
  424. cpu_optimizer_flag = " --cpu-optimizer"
  425. else:
  426. cpu_optimizer_flag = ""
  427. #-----------------Saving Checkpoint-----------------#
  428. # building checkpoint arguments
  429. test_config[
  430. "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval} {cpu_optimizer_flag}\""
  431. prefix = "gpt2_saving_checkpoint"
  432. # create checkpoint run...
  433. base_file = self.gen_name(test_config, prefix)
  434. # remove previous test log
  435. try:
  436. cmd = shlex.split(f"rm {base_file}")
  437. subprocess.run(cmd, check=False, executable='/bin/bash')
  438. except:
  439. print(f"{self.id()} No old logs")
  440. print("{0}: Run for saving checkpoint".format(self.id()))
  441. self.run_gpt2_test(test_config, base_file)
  442. #-----------------Loading Checkpoint-----------------#
  443. # building checkpoint arguments
  444. test_config["other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \""
  445. # set checkpoint load iteration
  446. try:
  447. cmd = shlex.split(f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt")
  448. print(f"{self.id()} running cmd: {cmd}")
  449. subprocess.run(cmd, check=False, executable='/bin/bash')
  450. except:
  451. print(f"{self.id()} Failed to update the checkpoint iteration file")
  452. return False
  453. prefix = "gpt2_loading_checkpoint"
  454. # set load gpus
  455. test_config["gpus"] = load_gpus
  456. print("{0}: Second run loading checkpoint and continuing.".format(self.id()))
  457. test_file = self.gen_name(test_config, prefix)
  458. # remove previous test log
  459. try:
  460. cmd = shlex.split(f"rm {test_file}")
  461. subprocess.run(cmd, check=False, executable='/bin/bash')
  462. except:
  463. print(f"{self.id()} no previous logs for")
  464. self.run_gpt2_test(test_config, test_file)
  465. return self.check_parity(base_file, test_file, r_tol)
  466. def has_loss_data(self, file_name):
  467. has_loss = False
  468. if os.path.exists(file_name):
  469. loss = grep_loss_from_file(file_name)
  470. if loss != 0.0:
  471. has_loss = True
  472. return has_loss
  473. def check_parity(self, base_file, test_file, r_tol):
  474. base_loss = grep_loss_from_file(base_file)
  475. test_loss = grep_loss_from_file(test_file)
  476. print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
  477. if base_loss == 0.0 or test_loss == 0.0:
  478. return False
  479. if abs((base_loss - test_loss) / base_loss) > r_tol:
  480. return False
  481. return True
  482. def checkpoint_suite():
  483. suite = unittest.TestSuite()
  484. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero1'))
  485. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2'))
  486. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2_offload'))
  487. # Shrink DP
  488. suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero1'))
  489. suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2'))
  490. suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload'))
  491. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero1'))
  492. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2'))
  493. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload'))
  494. # Expand DP
  495. suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero1'))
  496. suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2'))
  497. suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload'))
  498. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero1'))
  499. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2'))
  500. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload'))
  501. suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_without_zero'))
  502. return suite
  503. if __name__ == '__main__':
  504. runner = unittest.TextTestRunner(failfast=True)
  505. runner.run(checkpoint_suite())