# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os

from .builder import CUDAOpBuilder, installed_cuda_version
class RaggedOpsBuilder(CUDAOpBuilder):
    """Op builder for the ragged-batch device kernels used by DeepSpeed inference v2."""

    BUILD_VAR = "DS_BUILD_RAGGED_DEVICE_OPS"
    NAME = "ragged_device_ops"

    def __init__(self, name=None):
        # Fall back to the canonical op name when the caller supplies none.
        if name is None:
            name = self.NAME
        super().__init__(name=name)

    def absolute_name(self):
        # Fully-qualified module path under which the built extension is imported.
        return f'deepspeed.inference.v2.kernels.ragged_ops.{self.NAME}'

    def is_compatible(self, verbose=False):
        """Return True when torch and the local CUDA stack can build these kernels.

        Emits warnings (when ``verbose``) instead of raising, and defers the
        final verdict to the base class check combined with the CUDA gating here.
        """
        try:
            import torch
        except ImportError:
            if verbose:
                self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
        # ROCm and CPU-only environments skip the NVIDIA-specific gating below.
        if not self.is_rocm_pytorch() and torch.cuda.is_available():  #ignore-cuda
            sys_cuda_major, _ = installed_cuda_version()
            torch_cuda_major = int(torch.version.cuda.split('.')[0])
            cuda_capability = torch.cuda.get_device_properties(0).major  #ignore-cuda
            if cuda_capability < 6:
                if verbose:
                    self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
                cuda_okay = False
            if cuda_capability >= 8:
                # Ampere-class devices need CUDA 11+ from both torch and the system toolkit.
                if torch_cuda_major < 11 or sys_cuda_major < 11:
                    if verbose:
                        self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False
        return super().is_compatible(verbose) and cuda_okay

    def filter_ccs(self, ccs):
        """Keep only compute capabilities of 8.x or newer.

        Blocked flash has a dependency on Ampere and newer hardware, so older
        capabilities are pruned (with a warning listing what was dropped).
        """
        kept = []
        dropped = []
        for cc in ccs:
            target = kept if int(cc[0]) >= 8 else dropped
            target.append(cc)
        if dropped:
            self.warning(f"Filtered compute capabilities {dropped}")
        return kept

    def get_prefix(self):
        # When building from a source tree the "deepspeed" directory exists and
        # anchors relative paths; otherwise step up one level from the package.
        ds_path = self.deepspeed_src_path("deepspeed")
        return "deepspeed" if os.path.isdir(ds_path) else ".."

    def sources(self):
        """Return the C++/CUDA translation units for this op, prefix-resolved."""
        rel_sources = [
            "inference/v2/kernels/ragged_ops/ragged_ops.cpp",
            "inference/v2/kernels/ragged_ops/atom_builder/atom_builder.cpp",
            "inference/v2/kernels/ragged_ops/blocked_flash/blocked_flash.cpp",
            "inference/v2/kernels/ragged_ops/embed/embed.cpp",
            "inference/v2/kernels/ragged_ops/embed/embed_cuda.cu",
            "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp",
            "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu",
            "inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cpp",
            "inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu",
            "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp",
            "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu",
            "inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp",
            "inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu",
            "inference/v2/kernels/ragged_ops/ragged_helpers/ragged_kernel_helpers.cpp",
            "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp",
            "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu",
        ]
        prefix = self.get_prefix()
        return [os.path.join(prefix, src) for src in rel_sources]

    def extra_ldflags(self):
        """Linker flags locating the prebuilt blocked-flash library from dskernels."""
        import dskernels
        lib_dir = os.path.join(self.get_prefix(), dskernels.library_path())
        lib_dir = self.deepspeed_src_path(lib_dir)
        flags = [f'-L{lib_dir}', '-lblockedflash']
        if self.jit_load:
            # Embed a runtime search path so the JIT-loaded extension finds the library.
            flags.append(f'-Wl,-rpath,{lib_dir}')
        return flags

    def include_paths(self):
        """Return the include directories for this op, prefix-resolved."""
        rel_includes = [
            'inference/v2/kernels/includes',
            'inference/v2/kernels/ragged_ops',
            'inference/v2/kernels/ragged_ops/atom_builder',
            'inference/v2/kernels/ragged_ops/blocked_flash',
            'inference/v2/kernels/ragged_ops/embed',
            'inference/v2/kernels/ragged_ops/includes',
            'inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary',
            'inference/v2/kernels/ragged_ops/logits_gather',
            'inference/v2/kernels/ragged_ops/moe_gather',
            'inference/v2/kernels/ragged_ops/moe_scatter',
            'inference/v2/kernels/ragged_ops/ragged_helpers',
            'inference/v2/kernels/ragged_ops/top_k_gating',
        ]
        prefix = self.get_prefix()
        return [os.path.join(prefix, inc) for inc in rel_includes]