// deepspeed_py_copy.h
  1. /*
  2. Copyright 2020 The Microsoft DeepSpeed Team
  3. Licensed under the MIT license.
  4. Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
  5. */
  6. #if (__x86_64__ || __i386__)
  7. #include <cpuid.h>
  8. #include <x86intrin.h>
  9. #endif
  10. #include <deepspeed_aio_common.h>
  11. #include <stdlib.h>
  12. #include <torch/extension.h>
  13. #define TILE (1024 * 1024 * 1024)
  14. #if defined(__AVX512__)
  15. #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
  16. #define SIMD_LOAD(x) _mm512_loadu_ps(x)
  17. #define SIMD_SET(x) _mm512_set1_ps(x)
  18. #define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
  19. #define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
  20. #define SIMD_SQRT(x) _mm512_sqrt_ps(x)
  21. #define SIMD_DIV(x, y) _mm512_div_ps(x, y)
  22. #define SIMD_WIDTH 16
  23. #else
  24. #if defined(__AVX256__)
  25. #define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
  26. #define SIMD_LOAD(x) _mm256_loadu_ps(x)
  27. #define SIMD_SET(x) _mm256_set1_ps(x)
  28. #define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
  29. #define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
  30. #define SIMD_SQRT(x) _mm256_sqrt_ps(x)
  31. #define SIMD_DIV(x, y) _mm256_div_ps(x, y)
  32. #define SIMD_WIDTH 8
  33. #endif
  34. #endif
  35. int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src);