test_runtime_utils.py
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import torch
from torch._utils import _flatten_dense_tensors

import deepspeed.comm as dist
import deepspeed.runtime.utils as ds_utils
import deepspeed.utils.groups as groups
from deepspeed.accelerator import get_accelerator
from unit.common import DistributedTest


def test_call_to_str():
    c2s = ds_utils.call_to_str

    assert c2s('int') == 'int()'
    assert c2s('int', 3) == 'int(3)'
    assert c2s('int', 3, 'jeff') == 'int(3, \'jeff\')'

    assert c2s('hello', val=3) == 'hello(val=3)'
    assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)'
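
# Illustrative sketch (not part of the original suite; it assumes only the
# call_to_str behavior demonstrated above): positional and keyword arguments
# can be mixed in one call, and string arguments are rendered via repr().
def test_call_to_str_mixed_args_sketch():
    c2s = ds_utils.call_to_str
    assert c2s('fwd', 0, 'x', flag=True) == "fwd(0, 'x', flag=True)"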

class TestClipGradNorm(DistributedTest):
    world_size = 2

    def test(self):
        param1 = torch.nn.Parameter(torch.Tensor([0]))
        param1.grad = torch.Tensor([1])
        param2 = torch.nn.Parameter(torch.Tensor([0]))
        param2.grad = torch.Tensor([dist.get_rank() + 1])
        # Mark param2 as an MoE parameter: it is excluded from the regular
        # data-parallel all-reduce.
        param2.allreduce = False
        parameters = [param1, param2]

        groups._create_expert_and_data_parallel(2)

        norm = ds_utils.clip_grad_norm_(parameters, max_norm=0.1)
        norm = torch.Tensor([norm]).to(get_accelerator().device_name(dist.get_rank()))
        world_size = dist.get_world_size()
        gathered_norm = [torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size)]

        dist.all_gather(gathered_norm, norm)

        assert gathered_norm[0] == gathered_norm[1], "norm at rank 0 does not match the norm at rank 1"
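
# Sketch of why the assertion above is meaningful (pure torch, no dist; the
# values mirror the test). The local squared sums differ across ranks because
# param2.grad is rank-dependent, so identical gathered norms imply that
# clip_grad_norm_ reduced the MoE gradient contribution across ranks.
def _local_norms_differ_sketch():
    rank0_local = torch.tensor([1.0, 1.0]).norm()  # sqrt(1^2 + 1^2) = sqrt(2)
    rank1_local = torch.tensor([1.0, 2.0]).norm()  # sqrt(1^2 + 2^2) = sqrt(5)
    return not torch.isclose(rank0_local, rank1_local)  # True: locals disagree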

@pytest.mark.parametrize("check_using_norm", [False, True])
class TestCheckOverflow(DistributedTest):
    world_size = 2

    def test(self, check_using_norm):
        groups._create_expert_and_data_parallel(2)

        param1 = torch.nn.Parameter(torch.Tensor([0]))
        param1.grad = torch.Tensor([1])
        param2 = torch.nn.Parameter(torch.Tensor([0]))
        if dist.get_rank() == 0:
            param2.grad = torch.Tensor([1])
        else:
            # Plant an overflow on rank 1 only; the check must still trip
            # on every rank.
            param2.grad = torch.Tensor([float("inf")])
        # Mark param2 as an MoE parameter: it is excluded from the regular
        # data-parallel all-reduce.
        param2.allreduce = False
        parameters = [param1, param2]

        if check_using_norm:
            grads_group_flat = [_flatten_dense_tensors([p.grad for p in parameters])]
            norm = ds_utils.get_weight_norm(grads_group_flat)
            overflow_checker = ds_utils.CheckOverflow([parameters])
            overflow = overflow_checker.check_using_norm([norm], reduce_overflow=False)
        else:
            overflow_checker = ds_utils.CheckOverflow([parameters])
            overflow = overflow_checker.check()

        assert overflow
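
# Sketch of the norm-based overflow signal exercised above (pure torch; the
# helper name is illustrative, not a DeepSpeed API). Flattening the
# per-parameter grads and taking a norm turns any inf/nan gradient into a
# non-finite norm; detecting that non-finite value is the essence of the
# check_using_norm path.
def _overflow_from_norm_sketch():
    grads = [torch.tensor([1.0]), torch.tensor([float('inf')])]
    flat = _flatten_dense_tensors(grads)  # one contiguous 1-D buffer
    norm = flat.norm()                    # inf propagates into the norm
    return not torch.isfinite(norm).item()  # True -> overflow detected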