# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: copy the webtext data into the "Megatron-LM" folder before running this script.
- import unittest
- import subprocess
- import os
- import time
- import re
- from .test_common import BaseTestCase
# Dimensions of the small GPT2 model shared by every test configuration below.
LAYERS = 2
HIDDEN_SIZE = 128
ATTN_HEADS = 8
SEQ_LEN = 64

# Not referenced in the visible part of this file; presumably the rendezvous
# port for the distributed launcher -- TODO confirm against the launcher code.
MASTER_PORT = 29700
def grep_loss_from_file(file_name):
    """Return the final validation LM loss recorded in a training log.

    Scans ``file_name`` for lines containing the end-of-training validation
    summary and parses the number that follows ``LM loss:``. When several
    lines match, the value from the last parsable one is returned.

    Args:
        file_name: Path of the log file to scan.

    Returns:
        The parsed loss as a float, or 0.0 when no parsable loss was found
        (callers in this file treat 0.0 as "no loss data").

    Raises:
        OSError: If the file cannot be opened.
    """
    loss = 0.0
    print(f'grepping {file_name}')

    line_filter = "validation loss at the end of training for test data | LM loss:"
    # Raw string so that ``\.`` is a regex escape rather than an invalid
    # string escape sequence.
    match_number = re.compile(
        r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

    with open(file_name, 'r') as f:
        # Stream the log line by line instead of loading it all at once.
        for line in f:
            if line_filter not in line:
                continue
            found = match_number.search(line)
            # A summary line whose loss is not a plain number (e.g. "nan")
            # is skipped instead of raising IndexError.
            if found:
                loss = float(found.group(1))

    if loss == 0.0:
        print("no loss found in file ", file_name)

    return loss
class GPT2FuncTestCase(BaseTestCase):
    """Functional parity tests comparing baseline and DeepSpeed GPT2 runs.

    Each test builds a run configuration, performs (or reuses) a baseline
    run, performs a DeepSpeed run, and passes when the final validation LM
    losses logged by the two runs agree within a relative tolerance.

    ``gen_output_name`` and ``run_gpt2_test`` are not defined in this class;
    they are presumably inherited from ``BaseTestCase`` -- confirm in
    ``test_common``.
    """

    def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
        super().__init__(methodName)

    def setUp(self):
        """Remember the working directory and switch to this file's folder."""
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        # dirname is empty when the script is run from its own directory.
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        """Restore the working directory saved in ``setUp``."""
        os.chdir(self.save_dir)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _make_config(mp, gpus, bs, json_file, steps=1000, deepspeed=False,
                     **extra):
        """Build a run configuration dict for the shared small GPT2 model.

        ``extra`` carries the optional keys used by some tests
        (``baseline``, ``cpu_optimizer``, ``test_torch_offload``).
        """
        config = {
            "mp": mp,
            "gpus": gpus,
            "nodes": 1,
            "bs": bs,
            "steps": steps,
            "layers": LAYERS,
            "hidden_size": HIDDEN_SIZE,
            "seq_length": SEQ_LEN,
            "heads": ATTN_HEADS,
            "deepspeed": deepspeed,
            "json": json_file,
        }
        config.update(extra)
        return config

    def _assert_parity(self, test_config, r_tol, partition_activations=False):
        """Assert the basic run and, optionally, the partition-activations run."""
        self.assertTrue(self.run_test(test_config, r_tol))
        if partition_activations:
            self.assertTrue(
                self.run_partition_activations_test(test_config, r_tol))

    def _run_parity(self, test_config, r_tol, prefix, baseline_prefix,
                    extra_args=""):
        """Run baseline and DeepSpeed jobs and compare their final losses.

        Args:
            test_config: Run configuration; it is copied, never mutated.
            r_tol: Relative tolerance passed to ``check_parity``.
            prefix: Output-name prefix for the DeepSpeed run.
            baseline_prefix: Output-name prefix for the baseline run; the
                baseline json name (minus ``.json``) is appended when the
                configuration supplies a ``baseline`` entry.
            extra_args: Text placed before the cpu-optimizer flag in the
                DeepSpeed run's ``other_args``.

        Returns:
            True when both losses exist and agree within ``r_tol``.
        """
        print("\n")
        print("{0}: starting......".format(self.id()))

        # Work on a copy so that a config dict shared between several runs
        # of one test is not clobbered by this run.
        config = dict(test_config)
        deepspeed_config = config["json"]

        # Baseline run: DeepSpeed is switched off unless a dedicated
        # baseline DeepSpeed config is provided.
        has_baseline_config = "baseline" in config
        if has_baseline_config:
            config["json"] = config["baseline"]
            baseline_prefix += config["json"][0:-5]
        else:
            config["deepspeed"] = False

        cpu_optimizer_flag = self.gen_cpu_optimizer_flag(config, True)
        config["other_args"] = f"\"{cpu_optimizer_flag}\""

        base_file = self.gen_output_name(config,
                                         baseline_prefix,
                                         baseline_config=has_baseline_config)

        # Skip the baseline run if a previous one already produced a loss.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_gpt2_test(config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run: restore the json under test, which the baseline
        # branch above may have replaced with the baseline json.
        config["deepspeed"] = True
        config["json"] = deepspeed_config
        cpu_optimizer_flag = self.gen_cpu_optimizer_flag(config, False)
        config["other_args"] = f"\"{extra_args}{cpu_optimizer_flag}\""

        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(config, prefix)
        self.run_gpt2_test(config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    # ------------------------------------------------------------------
    # fp16 / ZeRO stage 1
    # ------------------------------------------------------------------
    def test_mp1_gpu2_node1_fp16(self):
        test_config = self._make_config(
            mp=1, gpus=2, bs=8, json_file="ds_config_func_bs8_no_zero.json")
        self._assert_parity(test_config, 0.01)

    def test_mp1_gpu1_node1_zero1(self):
        test_config = self._make_config(
            mp=1, gpus=1, bs=4, json_file="ds_config_func_bs4_zero1.json")
        self._assert_parity(test_config, 0.01)

    def test_mp1_gpu2_node1_zero1(self):
        test_config = self._make_config(
            mp=1, gpus=2, bs=8, json_file="ds_config_func_bs8_zero1.json")
        self._assert_parity(test_config, 0.01)

    def test_mp2_gpu4_node1_zero1(self):
        test_config = self._make_config(
            mp=2, gpus=4, bs=8, json_file="ds_config_func_bs8_zero1.json")
        self._assert_parity(test_config, 0.01)

    def test_mp4_gpu4_node1_zero1(self):
        test_config = self._make_config(
            mp=4, gpus=4, bs=8, json_file="ds_config_func_bs8_zero1.json")
        self._assert_parity(test_config, 0.01)

    # ------------------------------------------------------------------
    # ZeRO stage 2
    # ------------------------------------------------------------------
    def test_mp1_gpu1_node1_zero2(self):
        test_config = self._make_config(
            mp=1, gpus=1, bs=4, json_file="ds_config_func_bs4_zero2.json")
        self._assert_parity(test_config, 0.01)

    def test_mp1_gpu2_node1_zero2(self):
        test_config = self._make_config(
            mp=1, gpus=2, bs=8, json_file="ds_config_func_bs8_zero2.json")
        self._assert_parity(test_config, 0.01)

    def test_mp2_gpu4_node1_zero2(self):
        test_config = self._make_config(
            mp=2, gpus=4, bs=8, json_file="ds_config_func_bs8_zero2.json")
        self._assert_parity(test_config, 0.01, partition_activations=True)

    def test_mp4_gpu4_node1_zero2(self):
        test_config = self._make_config(
            mp=4, gpus=4, bs=8, json_file="ds_config_func_bs8_zero2.json")
        self._assert_parity(test_config, 0.01, partition_activations=True)

    # ------------------------------------------------------------------
    # ZeRO stage 2 + offload (DeepSpeed CPU optimizer), looser tolerance
    # ------------------------------------------------------------------
    def test_mp1_gpu1_node1_zero2_ds_offload(self):
        test_config = self._make_config(
            mp=1, gpus=1, bs=4,
            json_file="ds_config_func_bs4_zero2_offload.json",
            cpu_optimizer=True)
        self._assert_parity(test_config, 0.02)

    def test_mp1_gpu2_node1_zero2_ds_offload(self):
        test_config = self._make_config(
            mp=1, gpus=2, bs=8,
            json_file="ds_config_func_bs8_zero2_offload.json",
            cpu_optimizer=True)
        self._assert_parity(test_config, 0.02)

    def test_mp2_gpu4_node1_zero2_gas(self):
        # Only test with a dedicated baseline DeepSpeed config.
        test_config = self._make_config(
            mp=2, gpus=4, bs=8,
            json_file="ds_config_func_bs8_zero2_gas3.json",
            deepspeed=True,
            baseline="ds_config_func_bs8_zero0_gas3.json")
        self._assert_parity(test_config, 0.01, partition_activations=True)

    def test_mp2_gpu4_node1_zero2_ds_offload(self):
        test_config = self._make_config(
            mp=2, gpus=4, bs=8,
            json_file="ds_config_func_bs8_zero2_offload.json",
            cpu_optimizer=True)
        self._assert_parity(test_config, 0.02, partition_activations=True)

    def test_mp4_gpu4_node1_zero2_ds_offload(self):
        test_config = self._make_config(
            mp=4, gpus=4, bs=8,
            json_file="ds_config_func_bs8_zero2_offload.json",
            cpu_optimizer=True)
        self._assert_parity(test_config, 0.02, partition_activations=True)

    # ------------------------------------------------------------------
    # ZeRO stage 2 + offload (torch Adam on CPU)
    # ------------------------------------------------------------------
    def test_mp1_gpu1_node1_zero2_torch_offload(self):
        test_config = self._make_config(
            mp=1, gpus=1, bs=4,
            json_file="ds_config_func_bs4_zero2_offload.json",
            cpu_optimizer=True, test_torch_offload=True)
        self._assert_parity(test_config, 0.01)

    def test_mp1_gpu2_node1_zero2_torch_offload(self):
        test_config = self._make_config(
            mp=1, gpus=2, bs=8,
            json_file="ds_config_func_bs8_zero2_offload.json",
            cpu_optimizer=True, test_torch_offload=True)
        self._assert_parity(test_config, 0.01)

    def test_mp2_gpu4_node1_zero2_torch_offload(self):
        test_config = self._make_config(
            mp=2, gpus=4, bs=8,
            json_file="ds_config_func_bs8_zero2_offload.json",
            cpu_optimizer=True, test_torch_offload=True)
        self._assert_parity(test_config, 0.01, partition_activations=True)

    def test_mp4_gpu4_node1_zero2_torch_offload(self):
        test_config = self._make_config(
            mp=4, gpus=4, bs=8,
            json_file="ds_config_func_bs8_zero2_offload.json",
            cpu_optimizer=True, test_torch_offload=True)
        # The partition-activations result is asserted too; previously it
        # was computed and silently discarded.
        self._assert_parity(test_config, 0.01, partition_activations=True)

    def test_optimizer_scheduler(self):
        test_config = self._make_config(
            mp=1, gpus=1, bs=4, steps=20,
            json_file="ds_config_func_scheduler.json")
        # Only assure there is no crash; the parity result is deliberately
        # ignored.
        self.run_test(test_config, 0.01)
        self.assertTrue(True)

    # ------------------------------------------------------------------
    # Run drivers and checks
    # ------------------------------------------------------------------
    def run_partition_activations_test(self, test_config, r_tol):
        """Compare a baseline run with a DeepSpeed run that passes
        ``--deepspeed-activation-checkpointing``.

        Args:
            test_config: Run configuration dict (left unmodified).
            r_tol: Maximum allowed relative loss difference.

        Returns:
            True when the two final losses agree within ``r_tol``.
        """
        return self._run_parity(
            test_config,
            r_tol,
            prefix="gpt2_partition_activation_",
            baseline_prefix="gpt2_func_",
            extra_args="--deepspeed-activation-checkpointing ")

    def run_test(self, test_config, r_tol):
        """Compare a baseline run with a plain DeepSpeed run.

        Args:
            test_config: Run configuration dict (left unmodified).
            r_tol: Maximum allowed relative loss difference.

        Returns:
            True when the two final losses agree within ``r_tol``.
        """
        return self._run_parity(
            test_config,
            r_tol,
            prefix="gpt2_func",
            baseline_prefix="gpt2_func")

    def has_loss_data(self, file_name):
        """Return True when ``file_name`` exists and contains a non-zero loss."""
        return (os.path.exists(file_name)
                and grep_loss_from_file(file_name) != 0.0)

    def check_parity(self, base_file, test_file, r_tol):
        """Return True when the losses in both logs agree within ``r_tol``.

        A loss of 0.0 means "no loss found" and always fails the check.
        """
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)

        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))

        if base_loss == 0.0 or test_loss == 0.0:
            return False

        # ``<=`` (rather than ``not >``) also fails the check when the
        # relative difference is NaN.
        return abs((base_loss - test_loss) / base_loss) <= r_tol

    def gen_cpu_optimizer_flag(self, test_config, is_baseline):
        """Build the cpu-optimizer command-line flags for a run.

        Returns an empty string unless ``cpu_optimizer`` is set in the
        configuration. ``--cpu_torch_adam`` is appended for baseline runs
        and for runs whose configuration sets ``test_torch_offload``.
        """
        if not test_config.get('cpu_optimizer'):
            return ""

        cpu_optimizer_flag = "--cpu-optimizer"
        if is_baseline or test_config.get('test_torch_offload'):
            cpu_optimizer_flag += " --cpu_torch_adam"
        return cpu_optimizer_flag
def suite():
    """Assemble the GPT2 functional tests in their intended run order."""
    ordered_tests = (
        'test_mp1_gpu2_node1_fp16',

        # Baseline = Megatron + Torch.Optim.Adam
        # Test = Megatron + Torch.Optim.Adam + ZeRO-Offload
        'test_mp1_gpu1_node1_zero2_torch_offload',
        'test_mp1_gpu2_node1_zero2_torch_offload',
        'test_mp2_gpu4_node1_zero2_torch_offload',
        'test_mp4_gpu4_node1_zero2_torch_offload',

        # Baseline = Megatron + Torch.Optim.Adam
        # Test = Megatron + DeepSpeedAdam + ZeRO-Offload
        'test_mp1_gpu1_node1_zero2_ds_offload',
        'test_mp1_gpu2_node1_zero2_ds_offload',
        'test_mp2_gpu4_node1_zero2_ds_offload',
        'test_mp4_gpu4_node1_zero2_ds_offload',

        'test_mp1_gpu1_node1_zero1',
        'test_mp1_gpu2_node1_zero1',
        'test_mp2_gpu4_node1_zero1',
        'test_mp4_gpu4_node1_zero1',

        'test_mp1_gpu1_node1_zero2',
        'test_mp1_gpu2_node1_zero2',
        'test_mp2_gpu4_node1_zero2',
        'test_mp4_gpu4_node1_zero2',

        'test_mp2_gpu4_node1_zero2_gas',

        'test_optimizer_scheduler',
    )

    collected = unittest.TestSuite()
    collected.addTests(GPT2FuncTestCase(name) for name in ordered_tests)
    return collected
if __name__ == '__main__':
    # failfast=True: stop at the first failing or erroring test.
    unittest.TextTestRunner(failfast=True).run(suite())
|