# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os

from .builder import CUDAOpBuilder, installed_cuda_version
  6. class RaggedOpsBuilder(CUDAOpBuilder):
  7. BUILD_VAR = "DS_BUILD_RAGGED_DEVICE_OPS"
  8. NAME = "ragged_device_ops"
  9. def __init__(self, name=None):
  10. name = self.NAME if name is None else name
  11. super().__init__(name=name)
  12. def absolute_name(self):
  13. return f'deepspeed.inference.v2.kernels.ragged_ops.{self.NAME}'
  14. def is_compatible(self, verbose=False):
  15. try:
  16. import torch
  17. except ImportError:
  18. if verbose:
  19. self.warning("Please install torch if trying to pre-compile inference kernels")
  20. return False
  21. cuda_okay = True
  22. if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
  23. sys_cuda_major, _ = installed_cuda_version()
  24. torch_cuda_major = int(torch.version.cuda.split('.')[0])
  25. cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
  26. if cuda_capability < 6:
  27. if verbose:
  28. self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
  29. cuda_okay = False
  30. if cuda_capability >= 8:
  31. if torch_cuda_major < 11 or sys_cuda_major < 11:
  32. if verbose:
  33. self.warning("On Ampere and higher architectures please use CUDA 11+")
  34. cuda_okay = False
  35. return super().is_compatible(verbose) and cuda_okay
  36. def filter_ccs(self, ccs):
  37. ccs_retained = []
  38. ccs_pruned = []
  39. for cc in ccs:
  40. if int(cc[0]) >= 8:
  41. # Blocked flash has a dependency on Ampere + newer
  42. ccs_retained.append(cc)
  43. else:
  44. ccs_pruned.append(cc)
  45. if len(ccs_pruned) > 0:
  46. self.warning(f"Filtered compute capabilities {ccs_pruned}")
  47. return ccs_retained
  48. def get_prefix(self):
  49. ds_path = self.deepspeed_src_path("deepspeed")
  50. return "deepspeed" if os.path.isdir(ds_path) else ".."
  51. def sources(self):
  52. sources = [
  53. "inference/v2/kernels/ragged_ops/ragged_ops.cpp",
  54. "inference/v2/kernels/ragged_ops/atom_builder/atom_builder.cpp",
  55. "inference/v2/kernels/ragged_ops/blocked_flash/blocked_flash.cpp",
  56. "inference/v2/kernels/ragged_ops/embed/embed.cpp",
  57. "inference/v2/kernels/ragged_ops/embed/embed_cuda.cu",
  58. "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp",
  59. "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu",
  60. "inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cpp",
  61. "inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu",
  62. "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp",
  63. "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu",
  64. "inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp",
  65. "inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu",
  66. "inference/v2/kernels/ragged_ops/ragged_helpers/ragged_kernel_helpers.cpp",
  67. "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp",
  68. "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu",
  69. ]
  70. prefix = self.get_prefix()
  71. sources = [os.path.join(prefix, src) for src in sources]
  72. return sources
  73. def extra_ldflags(self):
  74. import dskernels
  75. lib_path = dskernels.library_path()
  76. prefix = self.get_prefix()
  77. lib_path = os.path.join(prefix, lib_path)
  78. lib_path = self.deepspeed_src_path(lib_path)
  79. args = [f'-L{lib_path}', '-lblockedflash']
  80. if self.jit_load:
  81. args.append(f'-Wl,-rpath,{lib_path}')
  82. return args
  83. def include_paths(self):
  84. sources = [
  85. 'inference/v2/kernels/includes',
  86. 'inference/v2/kernels/ragged_ops',
  87. 'inference/v2/kernels/ragged_ops/atom_builder',
  88. 'inference/v2/kernels/ragged_ops/blocked_flash',
  89. 'inference/v2/kernels/ragged_ops/embed',
  90. 'inference/v2/kernels/ragged_ops/includes',
  91. 'inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary',
  92. 'inference/v2/kernels/ragged_ops/logits_gather',
  93. 'inference/v2/kernels/ragged_ops/moe_gather',
  94. 'inference/v2/kernels/ragged_ops/moe_scatter',
  95. 'inference/v2/kernels/ragged_ops/ragged_helpers',
  96. 'inference/v2/kernels/ragged_ops/top_k_gating',
  97. ]
  98. prefix = self.get_prefix()
  99. sources = [os.path.join(prefix, src) for src in sources]
  100. return sources