// general_kernels.h (1.3 KB)
//
// Source-viewer line-number gutter: 1-47.
  #include <cuda.h>
  #include <cuda_fp16.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <cooperative_groups.h>
  #include <curand_kernel.h>
  #include <limits>
  #include "context.h"
  #include "cublas_wrappers.h"
  // Compile-time constants shared by the kernel launchers declared in this header.
  // NOTE(review): none of these macros is used inside this header itself; the
  // meanings below that go beyond the literal value are inferred from the names
  // and should be confirmed at the use sites.

  // Presumably the number of threads per block used for launches - TODO confirm.
  #define THREADS 256

  // Presumably the edge length of a square tile - TODO confirm.
  #define TILE_DIM 32

  // Negative float infinity (requires <limits>).
  // The expansion is fully parenthesized so that it behaves as a single operand:
  // the previous form `-1 * inf` turned `x / minus_infinity` into `(x / -1) * inf`.
  #define minus_infinity (-std::numeric_limits<float>::infinity())

  // All 32 bits set. Presumably the full-warp participant mask passed to the
  // *_sync warp intrinsics - TODO confirm.
  #define FINAL_MASK 0xffffffff
  /**
   * Two-input fused-add launcher (declaration only; the definition is not part
   * of this header).
   *
   * NOTE(review): judging by the name and parameter list, this presumably
   * enqueues a kernel that writes the element-wise sum `inp1 + inp2` into
   * `out` - confirm against the definition.
   *
   * @tparam T          Element type of the buffers. Which instantiations exist
   *                    is not visible here - TODO confirm.
   * @param out         Destination buffer (non-const, so it may be written).
   * @param inp1        First source buffer (read-only).
   * @param inp2        Second source buffer (read-only).
   * @param batch_size  Extent; presumably the element count is
   *                    batch_size * seq_length * hidden_size - TODO confirm.
   * @param seq_length  Extent, see batch_size.
   * @param hidden_size Extent, see batch_size.
   * @param stream      CUDA stream handle. Taken by non-const reference, so the
   *                    caller must pass an lvalue.
   */
  template <typename T>
  void launch_fused_add2(T* out,
                         const T* inp1,
                         const T* inp2,
                         int batch_size,
                         int seq_length,
                         int hidden_size,
                         cudaStream_t& stream);
  /**
   * Four-input fused-add launcher (declaration only; the definition is not part
   * of this header).
   *
   * NOTE(review): judging by the name and parameter list, this presumably
   * enqueues a kernel that writes the element-wise sum
   * `inp1 + inp2 + inp3 + inp4` into `out` - confirm against the definition.
   *
   * @tparam T          Element type of the buffers. Which instantiations exist
   *                    is not visible here - TODO confirm.
   * @param out         Destination buffer (non-const, so it may be written).
   * @param inp1        First source buffer (read-only).
   * @param inp2        Second source buffer (read-only).
   * @param inp3        Third source buffer (read-only).
   * @param inp4        Fourth source buffer (read-only).
   * @param batch_size  Extent; presumably the element count is
   *                    batch_size * seq_length * hidden_size - TODO confirm.
   * @param seq_length  Extent, see batch_size.
   * @param hidden_size Extent, see batch_size.
   * @param stream      CUDA stream handle. Taken by non-const reference, so the
   *                    caller must pass an lvalue.
   */
  template <typename T>
  void launch_fused_add4(T* out,
                         const T* inp1,
                         const T* inp2,
                         const T* inp3,
                         const T* inp4,
                         int batch_size,
                         int seq_length,
                         int hidden_size,
                         cudaStream_t& stream);
  /**
   * Three-input fused-add launcher (declaration only; the definition is not
   * part of this header).
   *
   * NOTE(review): judging by the name and parameter list, this presumably
   * enqueues a kernel that writes the element-wise sum `inp1 + inp2 + inp3`
   * into `out` - confirm against the definition.
   *
   * @tparam T          Element type of the buffers. Which instantiations exist
   *                    is not visible here - TODO confirm.
   * @param out         Destination buffer (non-const, so it may be written).
   * @param inp1        First source buffer (read-only).
   * @param inp2        Second source buffer (read-only).
   * @param inp3        Third source buffer (read-only).
   * @param batch_size  Extent; presumably the element count is
   *                    batch_size * seq_length * hidden_size - TODO confirm.
   * @param seq_length  Extent, see batch_size.
   * @param hidden_size Extent, see batch_size.
   * @param stream      CUDA stream handle. Taken by non-const reference, so the
   *                    caller must pass an lvalue.
   */
  template <typename T>
  void launch_fused_add3(T* out,
                         const T* inp1,
                         const T* inp2,
                         const T* inp3,
                         int batch_size,
                         int seq_length,
                         int hidden_size,
                         cudaStream_t& stream);