ds_kernel_utils.h 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. // Copyright (c) Microsoft Corporation.
  2. // SPDX-License-Identifier: Apache-2.0
  3. // DeepSpeed Team
  4. /*
  5. Centralized header file for preprocessor macros and constants
  6. used throughout the codebase.
  7. */
  8. #pragma once
  9. #include <cuda.h>
  10. #include <cuda_fp16.h>
  11. #ifdef BF16_AVAILABLE
  12. #include <cuda_bf16.h>
  13. #endif
  14. #define DS_HD_INLINE __host__ __device__ __forceinline__
  15. #define DS_D_INLINE __device__ __forceinline__
  16. #ifdef __HIP_PLATFORM_AMD__
  17. // constexpr variant of warpSize for templating
  18. constexpr int hw_warp_size = 64;
  19. #define HALF_PRECISION_AVAILABLE = 1
  20. #include <hip/hip_cooperative_groups.h>
  21. #include <hip/hip_fp16.h>
  22. #else // !__HIP_PLATFORM_AMD__
  23. // constexpr variant of warpSize for templating
  24. constexpr int hw_warp_size = 32;
  25. #if __CUDA_ARCH__ >= 530
  26. #define HALF_PRECISION_AVAILABLE = 1
  27. #define PTX_AVAILABLE
  28. #endif // __CUDA_ARCH__ >= 530
  29. #if __CUDA_ARCH__ >= 800
  30. #define ASYNC_COPY_AVAILABLE
  31. #endif // __CUDA_ARCH__ >= 800
  32. #include <cooperative_groups.h>
  33. #include <cuda_fp16.h>
  34. #endif //__HIP_PLATFORM_AMD__
  // Rounds `val` up to the nearest power of two; exact powers of two are
  // returned unchanged (e.g. 5 -> 8, 256 -> 256, 65537 -> 131072).
  //
  // How it works: subtracting 1 first keeps exact powers of two from being
  // bumped to the next one. The shift/OR cascade then copies the highest set
  // bit into every lower bit position, and the final +1 carries into the next
  // bit up, yielding a single set bit.
  //
  // The cascade must use shifts of 1, 2, 4, 8 and 16 to cover every bit of a
  // 32-bit int. Stopping at 8 only fills 16 bits below the top bit, so inputs
  // above 65536 would come out as non-powers-of-two (65537 -> 131071).
  //
  // Expected input range: 1 <= val <= 2^30 (assuming 32-bit int); larger values
  // have no power-of-two result representable in a signed int.
  inline int next_pow2(const int val)
  {
      int rounded_val = val - 1;
      rounded_val |= rounded_val >> 1;
      rounded_val |= rounded_val >> 2;
      rounded_val |= rounded_val >> 4;
      rounded_val |= rounded_val >> 8;
      rounded_val |= rounded_val >> 16;  // required for inputs above 2^16
      return rounded_val + 1;
  }