# test_bf16.py
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import torch

import deepspeed
from deepspeed import comm as dist
from deepspeed.ops.adam import FusedAdam
from deepspeed.ops.op_builder import CPUAdamBuilder

from unit.common import DistributedTest
from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader
from unit.util import bf16_required_version_check
  13. class TestAdamBF16ZeroOneCycleCompatibility(DistributedTest):
  14. world_size = 1
  15. def test(self, zero_stage=2, use_cpu_offload=False):
  16. if not bf16_required_version_check():
  17. pytest.skip(
  18. " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
  19. )
  20. if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
  21. pytest.skip("cpu-adam is not compatible")
  22. config_dict = {
  23. "train_micro_batch_size_per_gpu": 1,
  24. "steps_per_print": 1,
  25. "optimizer": {
  26. "type": "Adam",
  27. "params": {
  28. "lr": 0.00015
  29. }
  30. },
  31. "scheduler": {
  32. "type": "OneCycle",
  33. "params": {
  34. "cycle_first_step_size": 16000,
  35. "cycle_first_stair_count": 8000,
  36. "decay_step_size": 16000,
  37. "cycle_min_lr": 1e-06,
  38. "cycle_max_lr": 3e-05,
  39. "decay_lr_rate": 1e-07,
  40. "cycle_min_mom": 0.85,
  41. "cycle_max_mom": 0.99,
  42. "decay_mom_rate": 0.0
  43. }
  44. },
  45. "fp16": {
  46. "enabled": False
  47. },
  48. "bf16": {
  49. "enabled": True
  50. },
  51. "zero_optimization": {
  52. "stage": zero_stage,
  53. "cpu_offload": use_cpu_offload
  54. }
  55. }
  56. hidden_dim = 10
  57. model = SimpleModel(hidden_dim)
  58. model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
  59. data_loader = random_dataloader(model=model,
  60. total_samples=50,
  61. hidden_dim=hidden_dim,
  62. device=model.device,
  63. dtype=torch.bfloat16)
  64. for n, batch in enumerate(data_loader):
  65. loss = model(batch[0], batch[1])
  66. model.backward(loss)
  67. model.step()
  68. class TestZeroAllowUntestedOptimizer(DistributedTest):
  69. world_size = 1
  70. def test(self, zero_stage=2, use_cpu_offload=False):
  71. if not bf16_required_version_check():
  72. pytest.skip(
  73. " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
  74. )
  75. if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
  76. pytest.skip("cpu-adam is not compatible")
  77. config_dict = {
  78. "train_micro_batch_size_per_gpu": 4,
  79. "steps_per_print": 1,
  80. "fp16": {
  81. "enabled": False,
  82. },
  83. "bf16": {
  84. "enabled": True
  85. },
  86. "zero_optimization": {
  87. "stage": zero_stage,
  88. "cpu_offload": use_cpu_offload
  89. },
  90. "zero_allow_untested_optimizer": False
  91. }
  92. hidden_dim = 10
  93. model = SimpleModel(hidden_dim)
  94. optimizer = SimpleOptimizer(model.parameters())
  95. with pytest.raises(AssertionError):
  96. model, optim, _, _ = deepspeed.initialize(config=config_dict,
  97. model=model,
  98. optimizer=optimizer,
  99. model_parameters=model.parameters())
  100. class TestZeroEmptyPartition(DistributedTest):
  101. world_size = 3
  102. def test(self, zero_stage=2, use_cpu_offload=False):
  103. if not bf16_required_version_check():
  104. pytest.skip(
  105. " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
  106. )
  107. if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
  108. pytest.skip("cpu-adam is not compatible")
  109. if zero_stage == 3:
  110. pytest.skip("skip for now")
  111. config_dict = {
  112. "train_micro_batch_size_per_gpu": 1,
  113. "gradient_accumulation_steps": 1,
  114. "fp16": {
  115. "enabled": False
  116. },
  117. "bf16": {
  118. "enabled": True
  119. },
  120. "optimizer": {
  121. "type": "Adam",
  122. "params": {
  123. "lr": 0.00015
  124. }
  125. },
  126. "zero_optimization": {
  127. "stage": zero_stage,
  128. "cpu_offload": use_cpu_offload,
  129. "reduce_bucket_size": 100,
  130. "allgather_bucket_size": 100
  131. }
  132. }
  133. hidden_dim = 1
  134. model = SimpleModel(hidden_dim)
  135. # Ensure model has 2 parameters, to cause empty partition with DP=3
  136. assert len(list(model.parameters())) == 2
  137. model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
  138. # Now make sure things work..
  139. data_loader = random_dataloader(model=model,
  140. total_samples=1,
  141. hidden_dim=hidden_dim,
  142. device=model.device,
  143. dtype=torch.bfloat16)
  144. for n, batch in enumerate(data_loader):
  145. loss = model(batch[0], batch[1])
  146. model.backward(loss)
  147. model.step()
  148. @pytest.mark.parametrize("optimizer_constructor", [torch.optim.Adam, FusedAdam])
  149. class TestZeroSupportedClientOptimizer(DistributedTest):
  150. world_size = 1
  151. def test(self, optimizer_constructor, zero_stage=2):
  152. if not bf16_required_version_check():
  153. pytest.skip(
  154. " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
  155. )
  156. config_dict = {
  157. "train_micro_batch_size_per_gpu": 2,
  158. "steps_per_print": 1,
  159. "fp16": {
  160. "enabled": False
  161. },
  162. "bf16": {
  163. "enabled": True
  164. },
  165. "zero_optimization": {
  166. "stage": zero_stage
  167. }
  168. }
  169. hidden_dim = 10
  170. model = SimpleModel(hidden_dim)
  171. client_optimizer = optimizer_constructor(params=model.parameters())
  172. model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer)
  173. class TestZero2ReduceScatterOff(DistributedTest):
  174. world_size = 2
  175. def test(self):
  176. if not bf16_required_version_check():
  177. pytest.skip(
  178. " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
  179. )
  180. config_dict = {
  181. "train_micro_batch_size_per_gpu": 2,
  182. "steps_per_print": 1,
  183. "optimizer": {
  184. "type": "Adam",
  185. "params": {
  186. "lr": 0.00015
  187. }
  188. },
  189. "gradient_clipping": 1.0,
  190. "zero_optimization": {
  191. "stage": 2,
  192. "contiguous_gradients": True,
  193. "allgather_bucket_size": 2000000000,
  194. "reduce_bucket_size": 200000000,
  195. "overlap_comm": False,
  196. "reduce_scatter": False
  197. },
  198. "fp16": {
  199. "enabled": False
  200. },
  201. "bf16": {
  202. "enabled": True
  203. }
  204. }
  205. hidden_dim = 10
  206. model = SimpleModel(hidden_dim)
  207. model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
  208. data_loader = random_dataloader(model=model,
  209. total_samples=50,
  210. hidden_dim=hidden_dim,
  211. device=model.device,
  212. dtype=torch.bfloat16)
  213. for n, batch in enumerate(data_loader):
  214. loss = model(batch[0], batch[1])
  215. model.backward(loss)
  216. model.step()
  217. class TestZeroEmptyGrad(DistributedTest):
  218. world_size = 1
  219. def test(self, stage=2):
  220. if not bf16_required_version_check():
  221. pytest.skip(
  222. " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
  223. )
  224. config_dict = {
  225. "train_micro_batch_size_per_gpu": 1,
  226. "steps_per_print": 1,
  227. "fp16": {
  228. "enabled": False
  229. },
  230. "bf16": {
  231. "enabled": True
  232. },
  233. "zero_optimization": {
  234. "stage": stage
  235. }
  236. }
  237. hidden_dim = 10
  238. model = SimpleModel(hidden_dim)
  239. optimizer = torch.optim.Adam(model.parameters())
  240. model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer)
  241. data_loader = random_dataloader(model=model,
  242. total_samples=50,
  243. hidden_dim=hidden_dim,
  244. device=model.device,
  245. dtype=torch.bfloat16)
  246. for n, batch in enumerate(data_loader):
  247. loss = model(batch[0], batch[1])
  248. model.backward(loss)
  249. model.step()
  250. @pytest.mark.parametrize("comp_type", [torch.float16, torch.bfloat16, torch.float], ids=["fp16", "bfp16", "fp32"])
  251. @pytest.mark.parametrize("comm_type", [torch.float16, torch.bfloat16, None], ids=["fp16", "bfp16", "default"])
  252. class TestZeroDtypeCocktail(DistributedTest):
  253. world_size = 2
  254. def test(self, comp_type, comm_type):
  255. if comp_type == torch.bfloat16 or comm_type == torch.bfloat16:
  256. if not bf16_required_version_check():
  257. pytest.skip(
  258. " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
  259. )
  260. type_str = {torch.float16: "fp16", torch.bfloat16: "bfp16"}
  261. config_dict = {
  262. "train_micro_batch_size_per_gpu": 2,
  263. "steps_per_print": 1,
  264. "fp16": {
  265. "enabled": comp_type == torch.float16
  266. },
  267. "bf16": {
  268. "enabled": comp_type == torch.bfloat16
  269. },
  270. "zero_optimization": {
  271. "stage": 2
  272. },
  273. }
  274. if comm_type is not None:
  275. config_dict["communication_data_type"] = type_str[comm_type]
  276. else:
  277. comm_type = comp_type
  278. hidden_dim = 10
  279. model = SimpleModel(hidden_dim)
  280. optimizer = torch.optim.Adam(model.parameters())
  281. model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer)
  282. data_loader = random_dataloader(model=model,
  283. total_samples=2,
  284. hidden_dim=hidden_dim,
  285. device=model.device,
  286. dtype=comp_type)
  287. def custom_reduce(tensor, dst, op=dist.ReduceOp.SUM, group=None, async_op=False):
  288. assert tensor.dtype == comm_type
  289. return orig_torch_reduce(tensor, dst, op, group, async_op)
  290. orig_torch_reduce = dist.reduce
  291. dist.reduce = custom_reduce
  292. for n, batch in enumerate(data_loader):
  293. loss = model(batch[0], batch[1])
  294. model.backward(loss)
  295. model.step()
  296. dist.reduce = orig_torch_reduce