// general_kernels.h
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits>  // std::numeric_limits, used by the minus_infinity macro
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
  16. #define THREADS 256
  17. #define TILE_DIM 32
  18. #define minus_infinity -1 * std::numeric_limits<float>::infinity()
  19. #define FINAL_MASK 0xffffffff
  20. template <typename T>
  21. void launch_fused_add2(T* out,
  22. const T* inp1,
  23. const T* inp2,
  24. int batch_size,
  25. int seq_length,
  26. int hidden_size,
  27. cudaStream_t& stream);
  28. template <typename T>
  29. void launch_fused_add4(T* out,
  30. const T* inp1,
  31. const T* inp2,
  32. const T* inp3,
  33. const T* inp4,
  34. int batch_size,
  35. int seq_length,
  36. int hidden_size,
  37. cudaStream_t& stream);
  38. template <typename T>
  39. void launch_fused_add3(T* out,
  40. const T* inp1,
  41. const T* inp2,
  42. const T* inp3,
  43. int batch_size,
  44. int seq_length,
  45. int hidden_size,
  46. cudaStream_t& stream);