# test_config.py

# A test on its own
import torch
import pytest
import json
import argparse

from deepspeed.runtime.zero.config import DeepSpeedZeroConfig

from .common import distributed_test, get_test_path
from .simple_model import SimpleModel, create_config_from_dict, random_dataloader

import deepspeed.comm as dist

# A test on its own
import deepspeed
from deepspeed.runtime.config import DeepSpeedConfig, get_bfloat16_enabled


def test_cuda():
    assert torch.cuda.is_available()


def test_check_version():
    assert hasattr(deepspeed, "__git_hash__")
    assert hasattr(deepspeed, "__git_branch__")
    assert hasattr(deepspeed, "__version__")
    assert hasattr(deepspeed, "__version_major__")
    assert hasattr(deepspeed, "__version_minor__")
    assert hasattr(deepspeed, "__version_patch__")


def _run_batch_config(ds_config, train_batch=None, micro_batch=None, gas=None):
    ds_config.train_batch_size = train_batch
    ds_config.train_micro_batch_size_per_gpu = micro_batch
    ds_config.gradient_accumulation_steps = gas
    success = True
    try:
        ds_config._configure_train_batch_size()
    except AssertionError:
        success = False
    return success


def _batch_assert(status, ds_config, batch, micro_batch, gas, success):
    if not success:
        assert not status
        print("Failed but All is well")
        return
    assert ds_config.train_batch_size == batch
    assert ds_config.train_micro_batch_size_per_gpu == micro_batch
    assert ds_config.gradient_accumulation_steps == gas
    print("All is well")


# Tests the different batch configs that can be provided in the DeepSpeed
# JSON file
@pytest.mark.parametrize('num_ranks,batch,micro_batch,gas,success',
                         [(2, 32, 16, 1, True),
                          (2, 32, 8, 2, True),
                          (2, 33, 17, 2, False),
                          (2, 32, 18, 1, False)])  # yapf: disable
def test_batch_config(num_ranks, batch, micro_batch, gas, success):
    @distributed_test(world_size=2)
    def _test_batch_config(num_ranks, batch, micro_batch, gas, success):
        assert dist.get_world_size() == num_ranks, \
            f'The test assumes a world size of {num_ranks}'

        ds_batch_config = get_test_path('ds_batch_config.json')
        ds_config = DeepSpeedConfig(ds_batch_config)

        # Test cases where all three parameters are provided
        status = _run_batch_config(ds_config,
                                   train_batch=batch,
                                   micro_batch=micro_batch,
                                   gas=gas)
        _batch_assert(status, ds_config, batch, micro_batch, gas, success)

        # Test cases where two out of the three parameters are provided
        status = _run_batch_config(ds_config, train_batch=batch, micro_batch=micro_batch)
        _batch_assert(status, ds_config, batch, micro_batch, gas, success)

        if success:
            # When gas is provided alongside one other parameter
            status = _run_batch_config(ds_config, train_batch=batch, gas=gas)
            _batch_assert(status, ds_config, batch, micro_batch, gas, success)

            status = _run_batch_config(ds_config, micro_batch=micro_batch, gas=gas)
            _batch_assert(status, ds_config, batch, micro_batch, gas, success)

            # Test the case where only micro_batch or only train_batch is provided
            if gas == 1:
                status = _run_batch_config(ds_config, micro_batch=micro_batch)
                _batch_assert(status, ds_config, batch, micro_batch, gas, success)

                status = _run_batch_config(ds_config, train_batch=batch)
                _batch_assert(status, ds_config, batch, micro_batch, gas, success)
        else:
            # When only gas is provided
            status = _run_batch_config(ds_config, gas=gas)
            _batch_assert(status, ds_config, batch, micro_batch, gas, success)

            # When gas is provided with something else and gas does not divide batch
            if gas != 1:
                status = _run_batch_config(ds_config, train_batch=batch, gas=gas)
                _batch_assert(status, ds_config, batch, micro_batch, gas, success)

    # Run the batch config test
    _test_batch_config(num_ranks, batch, micro_batch, gas, success)
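

# Illustrative sketch, not part of the original suite: the three batch knobs
# exercised above are tied together by DeepSpeed's batch-size invariant,
#     train_batch_size == train_micro_batch_size_per_gpu
#                         * gradient_accumulation_steps
#                         * world_size,
# which is why e.g. (batch=33, micro_batch=17, gas=2) on 2 ranks must fail.
# The helper and test names below are hypothetical, added only to illustrate.
def _batch_invariant_holds(train_batch, micro_batch, gas, world_size):
    return train_batch == micro_batch * gas * world_size


def test_batch_invariant_examples():
    # Mirrors the parametrized cases of test_batch_config on a world size of 2.
    assert _batch_invariant_holds(32, 16, 1, 2)
    assert _batch_invariant_holds(32, 8, 2, 2)
    assert not _batch_invariant_holds(33, 17, 2, 2)
    assert not _batch_invariant_holds(32, 18, 1, 2)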


def test_temp_config_json(tmpdir):
    config_dict = {
        "train_batch_size": 1,
    }
    config_path = create_config_from_dict(tmpdir, config_dict)
    with open(config_path, 'r') as fd:
        config_json = json.load(fd)
    assert 'train_batch_size' in config_json


@pytest.mark.parametrize("gather_weights_key",
                         [
                             "stage3_gather_16bit_weights_on_model_save",
                             "stage3_gather_fp16_weights_on_model_save",
                         ])
def test_gather_16bit_params_on_model_save(gather_weights_key):
    config_dict = {
        gather_weights_key: True,
    }
    config = DeepSpeedZeroConfig(**config_dict)

    assert config.gather_16bit_weights_on_model_save


@pytest.mark.parametrize("bf16_key", ["bf16", "bfloat16"])
def test_get_bfloat16_enabled(bf16_key):
    cfg = {
        bf16_key: {
            "enabled": True,
        },
    }
    assert get_bfloat16_enabled(cfg)
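

# The parametrization above covers the two accepted spellings of the bfloat16
# section in a DeepSpeed config, i.e. {"bf16": {"enabled": true}} or
# {"bfloat16": {"enabled": true}}. A minimal companion check (assumption:
# get_bfloat16_enabled falls back to a default of False when neither section
# is present):
def test_get_bfloat16_disabled_by_default():
    assert not get_bfloat16_enabled({"train_batch_size": 1})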


def test_deprecated_deepscale_config(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True
        }
    }
    config_path = create_config_from_dict(tmpdir, config_dict)
    parser = argparse.ArgumentParser()
    args = parser.parse_args(args='')
    args.deepscale_config = config_path
    args.local_rank = 0

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_deprecated_deepscale_config(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=5,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim)
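

# Note: `deepscale_config` is the legacy spelling kept for backward
# compatibility, which is exactly what the test above exercises. The
# equivalent current usage would be (sketch, same config file):
#
#     args.deepspeed_config = config_path
#     model, _, _, _ = deepspeed.initialize(args=args,
#                                           model=model,
#                                           model_parameters=model.parameters())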


def test_dist_init_true(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True
        }
    }
    config_path = create_config_from_dict(tmpdir, config_dict)
    parser = argparse.ArgumentParser()
    args = parser.parse_args(args='')
    args.deepscale_config = config_path
    args.local_rank = 0

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_dist_init_true(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters(),
                                              dist_init_required=True)
        data_loader = random_dataloader(model=model,
                                        total_samples=5,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_dist_init_true(args=args, model=model, hidden_dim=hidden_dim)


def test_init_no_optimizer(tmpdir):
    config_dict = {"train_batch_size": 1, "fp16": {"enabled": True}}
    config_path = create_config_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=1)
    def _helper():
        parser = argparse.ArgumentParser()
        args = parser.parse_args(args='')
        args.deepscale_config = config_path
        args.local_rank = 0

        hidden_dim = 10
        model = SimpleModel(hidden_dim=hidden_dim)
        model, _, _, _ = deepspeed.initialize(args=args, model=model)
        data_loader = random_dataloader(model=model,
                                        total_samples=5,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            # fp16 is enabled but no optimizer was provided, so the engine
            # cannot run the optimizer path: backward() and step() must assert.
            with pytest.raises(AssertionError):
                model.backward(loss)
            with pytest.raises(AssertionError):
                model.step()

    _helper()


def test_none_args(tmpdir):
    config = {
        "train_batch_size": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True
        }
    }

    @distributed_test(world_size=1)
    def _helper():
        model = SimpleModel(hidden_dim=10)
        model, _, _, _ = deepspeed.initialize(args=None, model=model, config=config)
        data_loader = random_dataloader(model=model,
                                        total_samples=5,
                                        hidden_dim=10,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])

    _helper()


def test_no_args(tmpdir):
    config = {
        "train_batch_size": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True
        }
    }

    @distributed_test(world_size=1)
    def _helper():
        model = SimpleModel(hidden_dim=10)
        model, _, _, _ = deepspeed.initialize(model=model, config=config)
        data_loader = random_dataloader(model=model,
                                        total_samples=5,
                                        hidden_dim=10,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])

    _helper()


def test_no_model(tmpdir):
    config = {
        "train_batch_size": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True
        }
    }

    @distributed_test(world_size=1)
    def _helper():
        model = SimpleModel(hidden_dim=10)
        with pytest.raises(AssertionError):
            model, _, _, _ = deepspeed.initialize(model=None, config=config)

        # Here the model lands in the positional `args` parameter, so the
        # required `model` keyword is still None and initialize must assert.
        with pytest.raises(AssertionError):
            model, _, _, _ = deepspeed.initialize(model, config=config)

    _helper()