// deepspeed_py_copy.h
  // Copyright (c) Microsoft Corporation.
  // SPDX-License-Identifier: Apache-2.0
  // DeepSpeed Team
  /*
  Copyright 2020 The Microsoft DeepSpeed Team
  Licensed under the MIT license.
  Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
  */
  #if (__x86_64__ || __i386__)
  #include <cpuid.h>
  #include <x86intrin.h>
  #endif
  #include <deepspeed_aio_common.h>
  #include <stdlib.h>
  #include <torch/extension.h>
  // Size constant: 1024 * 1024 * 1024 == 2^30 (1073741824), which still fits in
  // a 32-bit signed int, so the expression does not overflow.
  // NOTE(review): presumably the chunk size used when tiling the copy; the unit
  // (bytes vs. elements) is not visible in this header -- confirm against the
  // implementation file.
  #define TILE (1024 * 1024 * 1024)
  // SIMD abstraction macros: generic SIMD_* names mapped onto packed
  // single-precision (float) x86 intrinsics.
  //
  //   __AVX512__ defined -> _mm512_* intrinsics, SIMD_WIDTH == 16 floats
  //   __AVX256__ defined -> _mm256_* intrinsics, SIMD_WIDTH == 8 floats
  //
  // __AVX512__ takes precedence when both are defined. When neither is defined,
  // none of the SIMD_* macros (including SIMD_WIDTH) exist, so code that uses
  // them must guard on one of these macros itself.
  //
  // Loads and stores use the unaligned forms (loadu / storeu).
  //
  // NOTE(review): __AVX512__ and __AVX256__ are not predefined by GCC/Clang
  // (those predefine __AVX512F__ / __AVX2__); presumably the build system
  // passes them with -D -- confirm.
  #if defined(__AVX512__)
  #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
  #define SIMD_LOAD(x) _mm512_loadu_ps(x)
  #define SIMD_SET(x) _mm512_set1_ps(x)
  #define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
  #define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
  #define SIMD_SQRT(x) _mm512_sqrt_ps(x)
  #define SIMD_DIV(x, y) _mm512_div_ps(x, y)
  #define SIMD_WIDTH 16
  #elif defined(__AVX256__)
  #define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
  #define SIMD_LOAD(x) _mm256_loadu_ps(x)
  #define SIMD_SET(x) _mm256_set1_ps(x)
  #define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
  #define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
  #define SIMD_SQRT(x) _mm256_sqrt_ps(x)
  #define SIMD_DIV(x, y) _mm256_div_ps(x, y)
  #define SIMD_WIDTH 8
  #endif
  38. int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src);