test_cpu_adam.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import torch
import numpy as np
import pytest
from cpuinfo import get_cpu_info

import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.adam import FusedAdam
from deepspeed.ops.op_builder import CPUAdamBuilder
from unit.common import DistributedTest

if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
    pytest.skip("cpu-adam is not compatible", allow_module_level=True)

pytest.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower()

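# check_equal: compare two tensors as numpy arrays and assert element-wise
# closeness within the given absolute tolerance; optionally print both tensors
# for debugging.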
def check_equal(first, second, atol=1e-2, verbose=False):
    x = first.detach().numpy()
    y = second.detach().numpy()
    print("ATOL", atol)
    if verbose:
        print("x = {}".format(x.flatten()))
        print("y = {}".format(y.flatten()))
        print('-' * 80)
    np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)

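# _compare_optimizers: drive both optimizers for 10 steps with identical
# gradients, then check that the resulting parameter norms agree within a
# tolerance of ~1% of the parameter norm.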
def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2):
    for i in range(10):
        param1.grad = torch.randn(model_size, device=param1.device).to(param1.dtype)
        param2.grad = param1.grad.clone().detach().to(device=param2.device, dtype=param2.dtype)

        optimizer1.step()
        optimizer2.step()

    tolerance = param1.float().norm().detach().numpy() * 1e-2
    check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True)

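# TestCPUAdam: parameter updates from DeepSpeedCPUAdam (running on the host)
# are compared against a reference optimizer for every dtype / model_size
# combination. When no accelerator is available, distributed init and env
# setup from DistributedTest are disabled.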
@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"])
@pytest.mark.parametrize('model_size',
                         [
                             (64),
                             (22),
                             #(55),
                             (128),
                             (1024),
                             (1048576),
                         ]) # yapf: disable
class TestCPUAdam(DistributedTest):
    world_size = 1
    reuse_dist_env = True
    requires_cuda_env = False
    if not get_accelerator().is_available():
        init_distributed = False
        set_dist_env = False

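    # Compare DeepSpeedCPUAdam updates on the CPU copy of the parameter against
    # FusedAdam updates on the accelerator copy; requires an accelerator.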
    @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.")
    def test_fused_adam_equal(self, dtype, model_size):
        if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
            pytest.skip("cpu-adam with half precision not supported on AMD CPUs")

        from deepspeed.ops.adam import DeepSpeedCPUAdam

        cpu_data = torch.randn(model_size, device='cpu').to(dtype)
        cpu_param = torch.nn.Parameter(cpu_data)
        cuda_param = torch.nn.Parameter(cpu_data.to(get_accelerator().device_name()))

        # tolerance = cpu_param.float().norm().detach().numpy() * 1e-2
        # check_equal(cpu_param.float().norm(),
        #             cuda_param.float().cpu().norm(),
        #             atol=tolerance,
        #             verbose=True)

        cpu_optimizer = DeepSpeedCPUAdam([cpu_param])
        cuda_optimizer = FusedAdam([cuda_param])

        _compare_optimizers(model_size=model_size,
                            param1=cpu_param,
                            optimizer1=cpu_optimizer,
                            param2=cuda_param,
                            optimizer2=cuda_optimizer)

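    # Compare DeepSpeedCPUAdam against torch.optim.AdamW; the reference runs on
    # the accelerator when one is available, otherwise on the CPU (fp32 only).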
    def test_torch_adamw_equal(self, dtype, model_size):
        if get_accelerator().is_available():
            if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
                pytest.skip("cpu-adam with half precision not supported on AMD CPUs")
            ref_param_device = get_accelerator().device_name()
        else:
            if dtype == torch.half:
                pytest.skip("torch.optim.AdamW with half precision only supported in CUDA environments.")
            ref_param_device = 'cpu'

        from deepspeed.ops.adam import DeepSpeedCPUAdam

        cpu_data = torch.randn(model_size, device='cpu').to(dtype)
        cpu_param = torch.nn.Parameter(cpu_data)
        ref_param = torch.nn.Parameter(cpu_data.to(ref_param_device))

        cpu_optimizer = DeepSpeedCPUAdam([cpu_param])
        ref_optimizer = torch.optim.AdamW([ref_param])

        _compare_optimizers(model_size=model_size,
                            param1=cpu_param,
                            optimizer1=cpu_optimizer,
                            param2=ref_param,
                            optimizer2=ref_optimizer)

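# TestCPUAdamGPUError: DeepSpeedCPUAdam operates on host memory, so stepping an
# optimizer whose parameter and gradient live on the accelerator is expected to
# fail with an AssertionError.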
class TestCPUAdamGPUError(DistributedTest):

    def test_cpu_adam_gpu_error(self):
        model_size = 64
        from deepspeed.ops.adam import DeepSpeedCPUAdam
        device = get_accelerator().device_name(0)  # 'cuda:0' or 'xpu:0'
        param = torch.nn.Parameter(torch.randn(model_size, device=device))
        optimizer = DeepSpeedCPUAdam([param])

        param.grad = torch.randn(model_size, device=device)
        with pytest.raises(AssertionError):
            optimizer.step()