# test_elastic.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import deepspeed
from unit.common import DistributedTest
from deepspeed.git_version_info import version as ds_version
import os
from unit.simple_model import SimpleModel
from deepspeed.ops.op_builder import FusedAdamBuilder

if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

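# Shared elastic config fixture used by the compute_elastic_config tests below.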
@pytest.fixture
def ds_config():
    config_dict = {
        "elasticity": {
            "enabled": True,
            "max_train_batch_size": 10000,
            "micro_batch_sizes": [8, 12, 16, 17],
            "min_gpus": 32,
            "max_gpus": 1500,
            "min_time": 20,
            "version": 0.1
        }
    }
    return config_dict

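# For the 10k max-batch config: every valid GPU count must evenly divide the
# final batch size, and the per-GPU batch must be a multiple of at least one
# configured micro-batch size.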
def test_basic_10k(ds_config):
    final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(
        ds_config=ds_config, target_deepspeed_version=ds_version)

    for gpu_num in valid_gpus:
        assert final_batch_size % gpu_num == 0, f"Batch {final_batch_size} is not divisible by GPU count {gpu_num}"
        batch_per_gpu = final_batch_size // gpu_num

        found_valid_mb = False
        for mb in ds_config['elasticity']['micro_batch_sizes']:
            if batch_per_gpu % mb == 0:
                found_valid_mb = True
                break
        assert found_valid_mb, "No valid micro-batch size found"

    assert len(valid_gpus) == 23
    assert final_batch_size == 9792

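# Targeting an older DeepSpeed version that does not support elasticity, or
# disabling elasticity outright, should raise ElasticityError.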
def test_old_version(ds_config):
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(
            ds_config=ds_config, target_deepspeed_version="0.2")

def test_disabled(ds_config):
    ds_config['elasticity']['enabled'] = False
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(
            ds_config=ds_config, target_deepspeed_version=ds_version)

def test_valid_world_size(ds_config):
    final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config(
        ds_config=ds_config, target_deepspeed_version=ds_version, world_size=64)
    assert mbsize == 17

def test_invalid_world_size(ds_config):
    with pytest.raises(deepspeed.elasticity.config.ElasticityIncompatibleWorldSize):
        final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config(
            ds_config=ds_config, target_deepspeed_version=ds_version, world_size=128)

def test_future_elastic_version(ds_config):
    ds_config['elasticity']['version'] = '0.3'
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)

def test_missing_max_batch(ds_config):
    del ds_config['elasticity']['max_train_batch_size']
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)

def test_missing_micro_batch(ds_config):
    del ds_config['elasticity']['micro_batch_sizes']
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)

def test_empty_config():
    ds_config = {"elasticity": {"enabled": True}}
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)

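# The next three tests cover model-parallel settings: they are rejected under
# elasticity version 0.1, and under 0.2 the world size must line up with
# model_parallel_size and num_gpus_per_node.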
def test_model_parallel_v1_invalid(ds_config):
    ds_config["elasticity"]["model_parallel_size"] = 4
    ds_config["elasticity"]["num_gpus_per_node"] = 8
    ds_config["elasticity"]["version"] = 0.1

    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)

def test_model_parallel_v2_invalid(ds_config):
    ds_config["elasticity"]["model_parallel_size"] = 16
    ds_config["elasticity"]["num_gpus_per_node"] = 8
    ds_config["elasticity"]["version"] = 0.2

    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(
            ds_config=ds_config, target_deepspeed_version=ds_version, world_size=16)

def test_model_parallel_v2_valid(ds_config):
    ds_config["elasticity"]["model_parallel_size"] = 4
    ds_config["elasticity"]["num_gpus_per_node"] = 8
    ds_config["elasticity"]["version"] = 0.2
    os.environ["WORLD_SIZE"] = str(16)

    deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)
    os.environ.pop("WORLD_SIZE")

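# Negative GPU counts and malformed micro_batch_sizes entries (negative values,
# a non-list value, non-integer elements) should all be rejected.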
@pytest.mark.parametrize('key, value', [
    ('micro_batch_sizes', [1, 4, -1, 2, -10]),
    ('min_gpus', -1),
    ('max_gpus', -1),
    ('micro_batch_sizes', 5),
    ('micro_batch_sizes', ['a', None, 0.5]),
    ('micro_batch_sizes', [2, 0.5, 4]),
])
def test_invalid_config_values(key, value, ds_config):
    ds_config['elasticity'][key] = value
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)

def test_proper_mbsz(ds_config):
    ds_config["elasticity"]["max_train_batch_size"] = 32
    ds_config["elasticity"]["micro_batch_sizes"] = [1, 2, 3, 7]
    ds_config["elasticity"]["min_gpus"] = 1
    final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config(
        ds_config=ds_config, target_deepspeed_version=ds_version, world_size=7)
    assert mbsize == 3

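# Passing train_batch_size alongside an elastic config (without
# ignore_non_elastic_batch_info) should fail inside deepspeed.initialize.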
class TestNonElasticBatchParams(DistributedTest):
    world_size = 2

    def test(self):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "gradient_clipping": 1.0,
            "elasticity": {
                "enabled": True,
                "max_train_batch_size": 4,
                "micro_batch_sizes": [1, 2, 3, 4],
                "min_gpus": 1,
                "max_gpus": 4,
                "min_time": 20,
                "version": 0.1
            }
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=False)

        with pytest.raises(deepspeed.elasticity.config.ElasticityError):
            model, _, _, _ = deepspeed.initialize(
                config=config_dict, model=model, model_parameters=model.parameters())

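# With ignore_non_elastic_batch_info set, the same batch settings are accepted
# and initialization succeeds.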
class TestNonElasticBatchParamsWithOverride(DistributedTest):
    world_size = 2

    def test(self):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "gradient_clipping": 1.0,
            "elasticity": {
                "enabled": True,
                "max_train_batch_size": 4,
                "micro_batch_sizes": [1, 2, 3, 4],
                "min_gpus": 1,
                "max_gpus": 4,
                "min_time": 20,
                "version": 0.1,
                "ignore_non_elastic_batch_info": True
            }
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=False)

        model, _, _, _ = deepspeed.initialize(
            config=config_dict, model=model, model_parameters=model.parameters())

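# If the runtime elastic config no longer matches the one the scheduler recorded
# in DEEPSPEED_ELASTICITY_CONFIG, initialization should fail.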
class TestElasticConfigChanged(DistributedTest):
    world_size = 2

    def test(self):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "gradient_clipping": 1.0,
            "elasticity": {
                "enabled": True,
                "max_train_batch_size": 4,
                "micro_batch_sizes": [1, 2, 3, 4],
                "min_gpus": 1,
                "max_gpus": 4,
                "min_time": 20,
                "version": 0.1,
                "ignore_non_elastic_batch_info": True
            }
        }
        import json, os
        scheduler_elastic_config = config_dict.copy()
        scheduler_elastic_config["elasticity"]["max_train_batch_size"] = 27
        os.environ['DEEPSPEED_ELASTICITY_CONFIG'] = json.dumps(scheduler_elastic_config)
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=False)

        with pytest.raises(deepspeed.elasticity.config.ElasticityError):
            model, _, _, _ = deepspeed.initialize(
                config=config_dict, model=model, model_parameters=model.parameters())