// ds_kernel_utils.h
  // Copyright (c) Microsoft Corporation.
  // SPDX-License-Identifier: Apache-2.0
  // DeepSpeed Team
  /*
  Centralized header file for preprocessor macros and constants
  used throughout the codebase.
  */
  #pragma once

  #include <cuda.h>

  // Qualifier for force-inlined functions callable from both host and device code.
  #define DS_HD_INLINE __host__ __device__ __forceinline__
  // Qualifier for force-inlined functions callable from device code only.
  #define DS_D_INLINE __device__ __forceinline__

  #ifdef __HIP_PLATFORM_HCC__

  // constexpr variant of warpSize for templating
  constexpr int hw_warp_size = 64;

  // Expands to the value 1 (not the token sequence `= 1`) so the macro can be
  // used in `#if` expressions as well as `#ifdef` checks.
  #define HALF_PRECISION_AVAILABLE 1

  #include <hip/hip_cooperative_groups.h>
  #include <hip/hip_fp16.h>

  #else  // !__HIP_PLATFORM_HCC__

  // constexpr variant of warpSize for templating
  constexpr int hw_warp_size = 32;

  // __CUDA_ARCH__ is not defined while compiling host code, so check for its
  // presence explicitly before comparing its value.
  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  // Expands to the value 1 (not the token sequence `= 1`) so the macro can be
  // used in `#if` expressions as well as `#ifdef` checks.
  #define HALF_PRECISION_AVAILABLE 1
  #define PTX_AVAILABLE
  #endif  // __CUDA_ARCH__ >= 530

  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  #define ASYNC_COPY_AVAILABLE
  #endif  // __CUDA_ARCH__ >= 800

  #include <cooperative_groups.h>
  #include <cuda_fp16.h>

  #endif  //__HIP_PLATFORM_HCC__
  /*
  Round `val` up to the nearest power of two. A value that is already a power
  of two is returned unchanged.

  The highest set bit of `val - 1` is smeared into every lower bit position and
  one is added to the result. Shifts of 1, 2, 4, 8 and 16 are all required to
  cover a 32-bit int: stopping at 8 fills only 16 bit positions and yields
  non-power-of-two results for inputs above 65536.

  Expected input range is 1 <= val <= 2^30; the next power of two for larger
  values does not fit in an int.
  */
  inline int next_pow2(const int val)
  {
      int rounded_val = val - 1;
      rounded_val |= rounded_val >> 1;
      rounded_val |= rounded_val >> 2;
      rounded_val |= rounded_val >> 4;
      rounded_val |= rounded_val >> 8;
      // Final step so the smear reaches all 32 bits.
      rounded_val |= rounded_val >> 16;
      return rounded_val + 1;
  }