BingBertSquad_run_func_test.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. # coding=utf-8
  2. # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
  3. #
  4. # Note: please copy the webtext data to the "Megatron-LM" folder before running this script.
  5. import unittest
  6. import subprocess
  7. import os
  8. import time
  9. import re
  10. from .BingBertSquad_test_common import BaseTestCase
  def grep_loss_from_file(file_name):
      """Return the last loss value reported in a training log.

      Scans ``file_name`` for lines containing the
      ``bert_squad_progress: step=`` marker and parses the number that
      follows ``loss=`` on each of them; the value from the last such line
      is the one returned.

      Args:
          file_name: Path of the log file to scan.

      Returns:
          The last parsed loss as a float, or 0.0 (after printing a notice)
          when no progress line carries a parsable loss.
      """
      loss = 0.0
      line_filter = "bert_squad_progress: step="
      # Raw string so that ``\.`` reaches the regex engine untouched instead
      # of being treated as an (invalid) string escape sequence.
      match_number = re.compile(
          r'loss=([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

      with open(file_name, 'r') as f:
          # Stream the file rather than loading every line into memory.
          for line in f:
              if line_filter not in line:
                  continue
              found = match_number.findall(line)
              # A progress line without a parsable loss is skipped instead of
              # raising IndexError on an empty match list.
              if found:
                  loss = float(found[0])

      if loss == 0.0:
          print("no loss found in file ", file_name)

      return loss
  class BingBertSquadFuncTestCase(BaseTestCase):
      """Functional tests comparing baseline and DeepSpeed BingBertSquad runs.

      Each ``test_*`` method builds a run configuration and hands it to
      ``run_test``, which compares the final loss of a baseline run with
      the final loss of a DeepSpeed run.
      """

      def __init__(self, methodName="DeepSpeed function test on BingBertSquad model"):
          super().__init__(methodName)

      def setUp(self):
          """Remember the current directory and switch to this file's folder."""
          self.save_dir = os.getcwd()
          script_dir = os.path.dirname(__file__)
          # dirname() is empty when the file is referenced by bare name; in
          # that case the working directory is left as it is.
          if script_dir:
              os.chdir(script_dir)

      def tearDown(self):
          """Return to the directory that was current before ``setUp``."""
          os.chdir(self.save_dir)

      def test_gpu4_fp16(self):
          """Parity check: 4 GPUs with the fp16 config."""
          config = {
              "gpus": 4,
              "deepspeed": False,
              "json": "deepspeed_bsz24_fp16_config.json",
              "max_steps": 8,
              "max_epoch_steps": 4,
              "other_args": "--fp16 --print_steps 1"
          }
          self.assertTrue(self.run_test(config, 0.01))

      def test_gpu4_fp16_zero2(self):
          """Parity check: 4 GPUs with the fp16 zero2 config."""
          config = {
              "gpus": 4,
              "deepspeed": False,
              "json": "deepspeed_bsz24_fp16_zero2_config.json",
              "max_steps": 8,
              "max_epoch_steps": 4,
              "other_args": "--fp16 --print_steps 1"
          }
          self.assertTrue(self.run_test(config, 0.01))

      def test_gpu1_fp16(self):
          """Parity check: 1 GPU with the fp16 config."""
          config = {
              "gpus": 1,
              "deepspeed": False,
              "json": "deepspeed_bsz24_fp16_config.json",
              "max_steps": 8,
              "max_epoch_steps": 4,
              "other_args": "--fp16 --print_steps 1"
          }
          self.assertTrue(self.run_test(config, 0.01))

      def test_gpu4_fp32(self):
          """Parity check: 4 GPUs with the fp32 config."""
          config = {
              "gpus": 4,
              "deepspeed": False,
              "json": "deepspeed_bsz24_fp32_config.json",
              "max_steps": 8,
              "max_epoch_steps": 4,
              "other_args": "--print_steps 1"
          }
          self.assertTrue(self.run_test(config, 0.01))

      def test_gpu1_fp32(self):
          """Parity check: 1 GPU with the fp32 config."""
          config = {
              "gpus": 1,
              "deepspeed": False,
              "json": "deepspeed_bsz24_fp32_config.json",
              "max_steps": 8,
              "max_epoch_steps": 4,
              "other_args": "--print_steps 1"
          }
          self.assertTrue(self.run_test(config, 0.01))

      def run_test(self, test_config, r_tol):
          """Run baseline and DeepSpeed configurations and compare their loss.

          ``test_config`` is modified in place: the step limits are appended
          to ``other_args`` and the ``deepspeed`` flag is toggled between
          the two runs. ``gen_output_name`` and ``run_BingBertSquad_test``
          are not defined in this class (presumably inherited from
          ``BaseTestCase`` -- confirm there).

          Args:
              test_config: Dict describing the run (see the ``test_*`` methods).
              r_tol: Maximum accepted relative difference between the losses.

          Returns:
              The result of ``check_parity`` for the two output files.
          """
          test_name = self.id()
          print("\n")
          print("{0}: starting......".format(test_name))

          prefix = "BingBertSquad_func"

          test_config['other_args'] += f" --max_steps {test_config['max_steps']}"
          test_config['other_args'] += (
              f" --max_steps_per_epoch {test_config['max_epoch_steps']}")

          # Baseline: reuse an earlier output file when it already has a loss.
          test_config["deepspeed"] = False
          base_file = self.gen_output_name(test_config, prefix)
          if self.has_loss_data(base_file):
              print("{0}: baseline exists.".format(test_name))
          else:
              print("{0}: baseline run.".format(test_name))
              self.run_BingBertSquad_test(test_config, base_file)

          # DeepSpeed: always executed.
          test_config["deepspeed"] = True
          print("{0}: DeepSpeed run.".format(test_name))
          test_file = self.gen_output_name(test_config, prefix)
          self.run_BingBertSquad_test(test_config, test_file)

          return self.check_parity(base_file, test_file, r_tol)

      def has_loss_data(self, file_name):
          """Return True when ``file_name`` exists and yields a non-zero loss."""
          return (os.path.exists(file_name)
                  and grep_loss_from_file(file_name) != 0.0)

      def check_parity(self, base_file, test_file, r_tol):
          """Report whether the two logged losses agree within ``r_tol``.

          Args:
              base_file: Log file of the baseline run.
              test_file: Log file of the DeepSpeed run.
              r_tol: Maximum accepted relative difference, measured against
                  the baseline loss.

          Returns:
              False when either loss is 0.0 (the value ``grep_loss_from_file``
              gives when nothing was found) or the relative difference
              exceeds ``r_tol``; True otherwise.
          """
          base_loss = grep_loss_from_file(base_file)
          test_loss = grep_loss_from_file(test_file)

          print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))

          if base_loss == 0.0 or test_loss == 0.0:
              return False

          relative_diff = abs((base_loss - test_loss) / base_loss)
          return not (relative_diff > r_tol)
  def suite():
      """Build the test suite holding every BingBertSquad functional test.

      Returns:
          A ``unittest.TestSuite`` with the five parity tests, added in the
          order listed below.
      """
      test_names = (
          'test_gpu4_fp16',
          'test_gpu4_fp16_zero2',
          'test_gpu1_fp16',
          'test_gpu4_fp32',
          'test_gpu1_fp32',
      )

      collected = unittest.TestSuite()
      for test_name in test_names:
          collected.addTest(BingBertSquadFuncTestCase(test_name))
      return collected
  if __name__ == '__main__':
      # Script entry point: run the whole suite, stopping at the first
      # failing test (failfast).
      unittest.TextTestRunner(failfast=True).run(suite())