fused_lamb_cuda.cpp 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. /* Copyright 2019 The Microsoft DeepSpeed Team */
  2. #include <torch/extension.h>
  3. // CUDA forward declaration
  4. void fused_lamb_cuda(at::Tensor& p,
  5. at::Tensor& p_copy,
  6. at::Tensor& m,
  7. at::Tensor& v,
  8. at::Tensor& g,
  9. float lr,
  10. float beta1,
  11. float beta2,
  12. float max_coeff,
  13. float min_coeff,
  14. float eps,
  15. float grad_scale,
  16. int step,
  17. int mode,
  18. int bias_correction,
  19. float decay,
  20. at::Tensor& w_l2_i,
  21. at::Tensor& u_l2_i,
  22. at::Tensor& lamb_coeff_val);
  // Input validation helpers built on AT_ASSERTM.
  //
  // CHECK_CUDA(x)       asserts x.type().is_cuda().
  // CHECK_CONTIGUOUS(x) asserts x.is_contiguous().
  // CHECK_INPUT(x)      runs both checks on x.
  //
  // CHECK_INPUT is wrapped in do { ... } while (0) so that it expands to exactly
  // one statement. Without the wrapper, `if (cond) CHECK_INPUT(x);` would guard
  // only the first check and run the contiguity check unconditionally.
  // The macro argument is parenthesised so that expressions can be passed safely.
  #define CHECK_CUDA(x) AT_ASSERTM((x).type().is_cuda(), #x " must be a CUDA tensor")
  #define CHECK_CONTIGUOUS(x) AT_ASSERTM((x).is_contiguous(), #x " must be contiguous")
  #define CHECK_INPUT(x)       \
      do {                     \
          CHECK_CUDA(x);       \
          CHECK_CONTIGUOUS(x); \
      } while (0)
  28. // C++ interface
  29. at::Tensor lamb(at::Tensor& p,
  30. at::Tensor& p_copy,
  31. at::Tensor& m,
  32. at::Tensor& v,
  33. at::Tensor& g,
  34. float lr,
  35. float beta1,
  36. float beta2,
  37. float max_coeff,
  38. float min_coeff,
  39. float eps,
  40. float grad_scale,
  41. int step,
  42. int mode,
  43. int bias_correction,
  44. float decay)
  45. {
  46. CHECK_INPUT(p);
  47. if (p_copy.numel() > 0) CHECK_INPUT(p_copy);
  48. CHECK_INPUT(m);
  49. CHECK_INPUT(v);
  50. CHECK_INPUT(g);
  51. int64_t num_elem = p.numel();
  52. AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
  53. AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
  54. AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
  55. AT_ASSERTM(
  56. p_copy.numel() == num_elem || p_copy.numel() == 0,
  57. "number of elements in p_copy and p tensors should be equal, or p_copy should be empty");
  58. // intermediate for weight L2 reduction
  59. // make sure that the threads per block is at least 512 during the kernel launch otherwise the
  60. // behavious is unexpected
  61. at::Tensor w_l2_i = at::empty(
  62. {512},
  63. p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float
  64. : p.type().scalarType()));
  65. // intermediate for update L2 reduction
  66. // make sure that the threads per block is at least 512 during the kernel launch otherwise the
  67. // behavious is unexpected
  68. at::Tensor u_l2_i = at::empty(
  69. {512},
  70. p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float
  71. : p.type().scalarType()));
  72. at::Tensor lamb_coeff_val = at::empty(
  73. {1},
  74. p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float
  75. : p.type().scalarType()));
  76. fused_lamb_cuda(p,
  77. p_copy,
  78. m,
  79. v,
  80. g,
  81. lr,
  82. beta1,
  83. beta2,
  84. max_coeff,
  85. min_coeff,
  86. eps,
  87. grad_scale,
  88. step,
  89. mode,
  90. bias_correction,
  91. decay,
  92. w_l2_i,
  93. u_l2_i,
  94. lamb_coeff_val);
  95. return lamb_coeff_val;
  96. }
  97. PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
  98. {
  99. m.def("lamb", &lamb, "Adam optimized CUDA implementation with LAMB.");
  100. }