123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850 |
- # Copyright (c) Microsoft Corporation.
- # SPDX-License-Identifier: Apache-2.0
- # DeepSpeed Team
- import os
- import re
- import sys
- import time
- import importlib
- from pathlib import Path
- import subprocess
- import shlex
- import shutil
- import tempfile
- import distutils.ccompiler
- import distutils.log
- import distutils.sysconfig
- from distutils.errors import CompileError, LinkError
- from abc import ABC, abstractmethod
- from typing import List
- YELLOW = '\033[93m'
- END = '\033[0m'
- WARNING = f"{YELLOW} [WARNING] {END}"
- DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"
- DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0"
- try:
- import torch
- except ImportError:
- print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.")
- else:
- TORCH_MAJOR = int(torch.__version__.split('.')[0])
- TORCH_MINOR = int(torch.__version__.split('.')[1])
- class MissingCUDAException(Exception):
- pass
- class CUDAMismatchException(Exception):
- pass
- def installed_cuda_version(name=""):
- import torch.utils.cpp_extension
- cuda_home = torch.utils.cpp_extension.CUDA_HOME
- if cuda_home is None:
- raise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)")
- # Ensure there is not a cuda version mismatch between torch and nvcc compiler
- output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True)
- output_split = output.split()
- release_idx = output_split.index("release")
- release = output_split[release_idx + 1].replace(',', '').split(".")
- # Ignore patch versions, only look at major + minor
- cuda_major, cuda_minor = release[:2]
- return int(cuda_major), int(cuda_minor)
- def get_default_compute_capabilities():
- compute_caps = DEFAULT_COMPUTE_CAPABILITIES
- import torch.utils.cpp_extension
- if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11:
- if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0:
- # Special treatment of CUDA 11.0 because compute_86 is not supported.
- compute_caps += ";8.0"
- else:
- compute_caps += ";8.0;8.6"
- return compute_caps
- # list compatible minor CUDA versions - so that for example pytorch built with cuda-11.0 can be used
- # to build deepspeed and system-wide installed cuda 11.2
- cuda_minor_mismatch_ok = {
- 10: ["10.0", "10.1", "10.2"],
- 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
- 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6"],
- }
- def assert_no_cuda_mismatch(name=""):
- cuda_major, cuda_minor = installed_cuda_version(name)
- sys_cuda_version = f'{cuda_major}.{cuda_minor}'
- torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
- # This is a show-stopping error, should probably not proceed past this
- if sys_cuda_version != torch_cuda_version:
- if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
- and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]):
- print(f"Installed CUDA version {sys_cuda_version} does not match the "
- f"version torch was compiled with {torch.version.cuda} "
- "but since the APIs are compatible, accepting this combination")
- return True
- elif os.getenv("DS_SKIP_CUDA_CHECK", "0") == "1":
- print(
- f"{WARNING} DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
- f"version torch was compiled with {torch.version.cuda}."
- "Detected `DS_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior."
- )
- return True
- raise CUDAMismatchException(
- f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
- f"version torch was compiled with {torch.version.cuda}, unable to compile "
- "cuda/cpp extensions without a matching cuda version.")
- return True
- class OpBuilder(ABC):
- _rocm_version = None
- _rocm_gpu_arch = None
- _rocm_wavefront_size = None
- _is_rocm_pytorch = None
- _is_sycl_enabled = None
- _loaded_ops = {}
- def __init__(self, name):
- self.name = name
- self.jit_mode = False
- self.build_for_cpu = False
- self.enable_bf16 = False
- self.error_log = None
- @abstractmethod
- def absolute_name(self):
- '''
- Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam
- will be installed as something like: deepspeed/ops/adam/cpu_adam.so
- '''
- pass
- @abstractmethod
- def sources(self):
- '''
- Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
- '''
- pass
- def hipify_extension(self):
- pass
- def sycl_extension(self):
- pass
- @staticmethod
- def validate_torch_version(torch_info):
- install_torch_version = torch_info['version']
- current_torch_version = ".".join(torch.__version__.split('.')[:2])
- if install_torch_version != current_torch_version:
- raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed "
- "with a different version than what is being used at runtime. "
- f"Please re-install DeepSpeed or switch torch versions. "
- f"Install torch version={install_torch_version}, "
- f"Runtime torch version={current_torch_version}")
- @staticmethod
- def validate_torch_op_version(torch_info):
- if not OpBuilder.is_rocm_pytorch():
- current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
- install_cuda_version = torch_info['cuda_version']
- if install_cuda_version != current_cuda_version:
- raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed "
- "with a different version than what is being used at runtime. "
- f"Please re-install DeepSpeed or switch torch versions. "
- f"Install CUDA version={install_cuda_version}, "
- f"Runtime CUDA version={current_cuda_version}")
- else:
- current_hip_version = ".".join(torch.version.hip.split('.')[:2])
- install_hip_version = torch_info['hip_version']
- if install_hip_version != current_hip_version:
- raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed "
- "with a different version than what is being used at runtime. "
- f"Please re-install DeepSpeed or switch torch versions. "
- f"Install HIP version={install_hip_version}, "
- f"Runtime HIP version={current_hip_version}")
- @staticmethod
- def is_rocm_pytorch():
- if OpBuilder._is_rocm_pytorch is not None:
- return OpBuilder._is_rocm_pytorch
- _is_rocm_pytorch = False
- try:
- import torch
- except ImportError:
- pass
- else:
- if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
- _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
- if _is_rocm_pytorch:
- from torch.utils.cpp_extension import ROCM_HOME
- _is_rocm_pytorch = ROCM_HOME is not None
- OpBuilder._is_rocm_pytorch = _is_rocm_pytorch
- return OpBuilder._is_rocm_pytorch
- @staticmethod
- def is_sycl_enabled():
- if OpBuilder._is_sycl_enabled is not None:
- return OpBuilder._is_sycl_enabled
- _is_sycl_enabled = False
- try:
- result = subprocess.run(["c2s", "--version"], capture_output=True)
- except:
- pass
- else:
- _is_sycl_enabled = True
- OpBuilder._is_sycl_enabled = _is_sycl_enabled
- return OpBuilder._is_sycl_enabled
- @staticmethod
- def installed_rocm_version():
- if OpBuilder._rocm_version:
- return OpBuilder._rocm_version
- ROCM_MAJOR = '0'
- ROCM_MINOR = '0'
- ROCM_VERSION_DEV_RAW = ""
- if OpBuilder.is_rocm_pytorch():
- from torch.utils.cpp_extension import ROCM_HOME
- rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version")
- if rocm_ver_file.is_file():
- with open(rocm_ver_file, 'r') as file:
- ROCM_VERSION_DEV_RAW = file.read()
- elif "rocm" in torch.__version__:
- ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1]
- if ROCM_VERSION_DEV_RAW != "":
- ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0]
- ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1]
- else:
- # Look in /usr/include/rocm-version.h
- rocm_ver_file = Path("/usr/include/rocm_version.h")
- if rocm_ver_file.is_file():
- with open(rocm_ver_file, 'r') as file:
- for ln in file.readlines():
- if "#define ROCM_VERSION_MAJOR" in ln:
- ROCM_MAJOR = re.findall(r'\S+', ln)[2]
- elif "#define ROCM_VERSION_MINOR" in ln:
- ROCM_MINOR = re.findall(r'\S+', ln)[2]
- if ROCM_MAJOR == '0':
- assert False, "Could not detect ROCm version"
- OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
- return OpBuilder._rocm_version
- @staticmethod
- def get_rocm_gpu_arch():
- if OpBuilder._rocm_gpu_arch:
- return OpBuilder._rocm_gpu_arch
- rocm_info = Path("/opt/rocm/bin/rocminfo")
- if (not rocm_info.is_file()):
- rocm_info = Path("rocminfo")
- rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'"
- try:
- result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True)
- rocm_gpu_arch = result.decode('utf-8').strip()
- except subprocess.CalledProcessError:
- rocm_gpu_arch = ""
- OpBuilder._rocm_gpu_arch = rocm_gpu_arch
- return OpBuilder._rocm_gpu_arch
- @staticmethod
- def get_rocm_wavefront_size():
- if OpBuilder._rocm_wavefront_size:
- return OpBuilder._rocm_wavefront_size
- rocm_info = Path("/opt/rocm/bin/rocminfo")
- if (not rocm_info.is_file()):
- rocm_info = Path("rocminfo")
- rocm_wavefront_size_cmd = str(
- rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'"
- try:
- result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True)
- rocm_wavefront_size = result.decode('utf-8').strip()
- except subprocess.CalledProcessError:
- rocm_wavefront_size = "32"
- OpBuilder._rocm_wavefront_size = rocm_wavefront_size
- return OpBuilder._rocm_wavefront_size
- def include_paths(self):
- '''
- Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
- '''
- return []
- def nvcc_args(self):
- '''
- Returns optional list of compiler flags to forward to nvcc when building CUDA sources
- '''
- return []
- def cxx_args(self):
- '''
- Returns optional list of compiler flags to forward to the build
- '''
- return []
- def is_compatible(self, verbose=False):
- '''
- Check if all non-python dependencies are satisfied to build this op
- '''
- return True
- def extra_ldflags(self):
- return []
- def has_function(self, funcname, libraries, library_dirs=None, verbose=False):
- '''
- Test for existence of a function within a tuple of libraries.
- This is used as a smoke test to check whether a certain library is available.
- As a test, this creates a simple C program that calls the specified function,
- and then distutils is used to compile that program and link it with the specified libraries.
- Returns True if both the compile and link are successful, False otherwise.
- '''
- tempdir = None # we create a temporary directory to hold various files
- filestderr = None # handle to open file to which we redirect stderr
- oldstderr = None # file descriptor for stderr
- try:
- # Echo compile and link commands that are used.
- if verbose:
- distutils.log.set_verbosity(1)
- # Create a compiler object.
- compiler = distutils.ccompiler.new_compiler(verbose=verbose)
- # Configure compiler and linker to build according to Python install.
- distutils.sysconfig.customize_compiler(compiler)
- # Create a temporary directory to hold test files.
- tempdir = tempfile.mkdtemp()
- # Define a simple C program that calls the function in question
- prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname)
- # Write the test program to a file.
- filename = os.path.join(tempdir, 'test.c')
- with open(filename, 'w') as f:
- f.write(prog)
- # Redirect stderr file descriptor to a file to silence compile/link warnings.
- if not verbose:
- filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w')
- oldstderr = os.dup(sys.stderr.fileno())
- os.dup2(filestderr.fileno(), sys.stderr.fileno())
- # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames()
- # Otherwise, a local directory will be used instead of tempdir
- drive, driveless_filename = os.path.splitdrive(filename)
- root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else ''
- output_dir = os.path.join(drive, root_dir)
- # Attempt to compile the C program into an object file.
- cflags = shlex.split(os.environ.get('CFLAGS', ""))
- objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags))
- # Attempt to link the object file into an executable.
- # Be sure to tack on any libraries that have been specified.
- ldflags = shlex.split(os.environ.get('LDFLAGS', ""))
- compiler.link_executable(objs,
- os.path.join(tempdir, 'a.out'),
- extra_preargs=self.strip_empty_entries(ldflags),
- libraries=libraries,
- library_dirs=library_dirs)
- # Compile and link succeeded
- return True
- except CompileError:
- return False
- except LinkError:
- return False
- except:
- return False
- finally:
- # Restore stderr file descriptor and close the stderr redirect file.
- if oldstderr is not None:
- os.dup2(oldstderr, sys.stderr.fileno())
- if filestderr is not None:
- filestderr.close()
- # Delete the temporary directory holding the test program and stderr files.
- if tempdir is not None:
- shutil.rmtree(tempdir)
- def strip_empty_entries(self, args):
- '''
- Drop any empty strings from the list of compile and link flags
- '''
- return [x for x in args if len(x) > 0]
- def cpu_arch(self):
- try:
- from cpuinfo import get_cpu_info
- except ImportError as e:
- cpu_info = self._backup_cpuinfo()
- if cpu_info is None:
- return "-march=native"
- try:
- cpu_info = get_cpu_info()
- except Exception as e:
- self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
- "falling back to `lscpu` to get this information.")
- cpu_info = self._backup_cpuinfo()
- if cpu_info is None:
- return "-march=native"
- if cpu_info['arch'].startswith('PPC_'):
- # gcc does not provide -march on PowerPC, use -mcpu instead
- return '-mcpu=native'
- return '-march=native'
- def is_cuda_enable(self):
- try:
- assert_no_cuda_mismatch(self.name)
- return '-D__ENABLE_CUDA__'
- except MissingCUDAException:
- print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, "
- "only cpu ops can be compiled!")
- return '-D__DISABLE_CUDA__'
- return '-D__DISABLE_CUDA__'
- def _backup_cpuinfo(self):
- # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides
- if not self.command_exists('lscpu'):
- self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
- "to detect the CPU architecture. 'lscpu' does not appear to exist on "
- "your system, will fall back to use -march=native and non-vectorized execution.")
- return None
- result = subprocess.check_output(['lscpu'])
- result = result.decode('utf-8').strip().lower()
- cpu_info = {}
- cpu_info['arch'] = None
- cpu_info['flags'] = ""
- if 'genuineintel' in result or 'authenticamd' in result:
- cpu_info['arch'] = 'X86_64'
- if 'avx512' in result:
- cpu_info['flags'] += 'avx512,'
- elif 'avx512f' in result:
- cpu_info['flags'] += 'avx512f,'
- if 'avx2' in result:
- cpu_info['flags'] += 'avx2'
- elif 'ppc64le' in result:
- cpu_info['arch'] = "PPC_"
- return cpu_info
- def simd_width(self):
- try:
- from cpuinfo import get_cpu_info
- except ImportError as e:
- cpu_info = self._backup_cpuinfo()
- if cpu_info is None:
- return '-D__SCALAR__'
- try:
- cpu_info = get_cpu_info()
- except Exception as e:
- self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
- "falling back to `lscpu` to get this information.")
- cpu_info = self._backup_cpuinfo()
- if cpu_info is None:
- return '-D__SCALAR__'
- if cpu_info['arch'] == 'X86_64':
- if 'avx512' in cpu_info['flags'] or 'avx512f' in cpu_info['flags']:
- return '-D__AVX512__'
- elif 'avx2' in cpu_info['flags']:
- return '-D__AVX256__'
- return '-D__SCALAR__'
- def command_exists(self, cmd):
- if '|' in cmd:
- cmds = cmd.split("|")
- else:
- cmds = [cmd]
- valid = False
- for cmd in cmds:
- safe_cmd = ["bash", "-c", f"type {cmd}"]
- result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE)
- valid = valid or result.wait() == 0
- if not valid and len(cmds) > 1:
- print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!")
- elif not valid and len(cmds) == 1:
- print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!")
- return valid
- def warning(self, msg):
- self.error_log = f"{msg}"
- print(f"{WARNING} {msg}")
- def deepspeed_src_path(self, code_path):
- if os.path.isabs(code_path):
- return code_path
- else:
- return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
- def builder(self):
- from torch.utils.cpp_extension import CppExtension
- include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
- return CppExtension(name=self.absolute_name(),
- sources=self.strip_empty_entries(self.sources()),
- include_dirs=include_dirs,
- extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
- extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
- def load(self, verbose=True):
- if self.name in __class__._loaded_ops:
- return __class__._loaded_ops[self.name]
- from deepspeed.git_version_info import installed_ops, torch_info, accelerator_name
- from deepspeed.accelerator import get_accelerator
- if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name:
- # Ensure the op we're about to load was compiled with the same
- # torch/cuda versions we are currently using at runtime.
- self.validate_torch_version(torch_info)
- if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder):
- self.validate_torch_op_version(torch_info)
- op_module = importlib.import_module(self.absolute_name())
- __class__._loaded_ops[self.name] = op_module
- return op_module
- else:
- return self.jit_load(verbose)
- def jit_load(self, verbose=True):
- if not self.is_compatible(verbose):
- raise RuntimeError(
- f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
- )
- try:
- import ninja # noqa: F401 # type: ignore
- except ImportError:
- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
- if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
- self.build_for_cpu = not torch.cuda.is_available()
- self.jit_mode = True
- from torch.utils.cpp_extension import load
- start_build = time.time()
- sources = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.sources()]
- extra_include_paths = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.include_paths()]
- # Torch will try and apply whatever CCs are in the arch list at compile time,
- # we have already set the intended targets ourselves we know that will be
- # needed at runtime. This prevents CC collisions such as multiple __half
- # implementations. Stash arch list to reset after build.
- torch_arch_list = None
- if "TORCH_CUDA_ARCH_LIST" in os.environ:
- torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
- os.environ["TORCH_CUDA_ARCH_LIST"] = ""
- nvcc_args = self.strip_empty_entries(self.nvcc_args())
- cxx_args = self.strip_empty_entries(self.cxx_args())
- if isinstance(self, CUDAOpBuilder):
- if not self.build_for_cpu and self.enable_bf16:
- cxx_args.append("-DBF16_AVAILABLE")
- nvcc_args.append("-DBF16_AVAILABLE")
- nvcc_args.append("-U__CUDA_NO_BFLOAT16_OPERATORS__")
- nvcc_args.append("-U__CUDA_NO_BFLOAT162_OPERATORS__")
- nvcc_args.append("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
- if self.is_rocm_pytorch():
- cxx_args.append("-D__HIP_PLATFORM_AMD__=1")
- os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch()
- cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size())
- op_module = load(name=self.name,
- sources=self.strip_empty_entries(sources),
- extra_include_paths=self.strip_empty_entries(extra_include_paths),
- extra_cflags=cxx_args,
- extra_cuda_cflags=nvcc_args,
- extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
- verbose=verbose)
- build_duration = time.time() - start_build
- if verbose:
- print(f"Time to load {self.name} op: {build_duration} seconds")
- # Reset arch list so we are not silently removing it for other possible use cases
- if torch_arch_list:
- os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list
- __class__._loaded_ops[self.name] = op_module
- return op_module
- class CUDAOpBuilder(OpBuilder):
- def compute_capability_args(self, cross_compile_archs=None):
- """
- Returns nvcc compute capability compile flags.
- 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`.
- 2. If neither is set default compute capabilities will be used
- 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX
- Format:
- - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples:
- TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ...
- TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ...
- - `cross_compile_archs` uses ; separator.
- """
- ccs = []
- if self.jit_mode:
- # Compile for underlying architectures since we know those at runtime
- for i in range(torch.cuda.device_count()):
- CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i)
- cc = f"{CC_MAJOR}.{CC_MINOR}"
- if cc not in ccs:
- ccs.append(cc)
- ccs = sorted(ccs)
- ccs[-1] += '+PTX'
- else:
- # Cross-compile mode, compile for various architectures
- # env override takes priority
- cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
- if cross_compile_archs_env is not None:
- if cross_compile_archs is not None:
- print(
- f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`"
- )
- cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
- else:
- if cross_compile_archs is None:
- cross_compile_archs = get_default_compute_capabilities()
- ccs = cross_compile_archs.split(';')
- ccs = self.filter_ccs(ccs)
- if len(ccs) == 0:
- raise RuntimeError(
- f"Unable to load {self.name} op due to no compute capabilities remaining after filtering")
- args = []
- self.enable_bf16 = True
- for cc in ccs:
- num = cc[0] + cc[2]
- args.append(f'-gencode=arch=compute_{num},code=sm_{num}')
- if cc.endswith('+PTX'):
- args.append(f'-gencode=arch=compute_{num},code=compute_{num}')
- if int(cc[0]) <= 7:
- self.enable_bf16 = False
- return args
- def filter_ccs(self, ccs: List[str]):
- """
- Prune any compute capabilities that are not compatible with the builder. Should log
- which CCs have been pruned.
- """
- return ccs
- def version_dependent_macros(self):
- # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
- version_ge_1_1 = []
- if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
- version_ge_1_1 = ['-DVERSION_GE_1_1']
- version_ge_1_3 = []
- if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
- version_ge_1_3 = ['-DVERSION_GE_1_3']
- version_ge_1_5 = []
- if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
- version_ge_1_5 = ['-DVERSION_GE_1_5']
- return version_ge_1_1 + version_ge_1_3 + version_ge_1_5
- def is_compatible(self, verbose=False):
- return super().is_compatible(verbose)
- def builder(self):
- try:
- if not self.is_rocm_pytorch():
- assert_no_cuda_mismatch(self.name)
- self.build_for_cpu = False
- except MissingCUDAException:
- self.build_for_cpu = True
- if self.build_for_cpu:
- from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
- else:
- from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
- include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
- compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \
- {'cxx': self.strip_empty_entries(self.cxx_args()), \
- 'nvcc': self.strip_empty_entries(self.nvcc_args())}
- if not self.build_for_cpu and self.enable_bf16:
- compile_args['cxx'].append("-DBF16_AVAILABLE")
- compile_args['nvcc'].append("-DBF16_AVAILABLE")
- if self.is_rocm_pytorch():
- compile_args['cxx'].append("-D__HIP_PLATFORM_AMD__=1")
- #cxx compiler args are required to compile cpp files
- compile_args['cxx'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size())
- #nvcc compiler args are required to compile hip files
- compile_args['nvcc'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size())
- if self.get_rocm_gpu_arch():
- os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch()
- cuda_ext = ExtensionBuilder(name=self.absolute_name(),
- sources=self.strip_empty_entries(self.sources()),
- include_dirs=include_dirs,
- libraries=self.strip_empty_entries(self.libraries_args()),
- extra_compile_args=compile_args,
- extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
- if self.is_rocm_pytorch():
- # hip converts paths to absolute, this converts back to relative
- sources = cuda_ext.sources
- curr_file = Path(__file__).parent.parent # ds root
- for i in range(len(sources)):
- src = Path(sources[i])
- if src.is_absolute():
- sources[i] = str(src.relative_to(curr_file))
- else:
- sources[i] = str(src)
- cuda_ext.sources = sources
- return cuda_ext
- def hipify_extension(self):
- if self.is_rocm_pytorch():
- from torch.utils.hipify import hipify_python
- hipify_python.hipify(
- project_directory=os.getcwd(),
- output_directory=os.getcwd(),
- header_include_dirs=self.include_paths(),
- includes=[os.path.join(os.getcwd(), '*')],
- extra_files=[os.path.abspath(s) for s in self.sources()],
- show_detailed=True,
- is_pytorch_extension=True,
- hipify_extra_files_only=True,
- )
- def cxx_args(self):
- if sys.platform == "win32":
- return ['-O2']
- else:
- return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
- def nvcc_args(self):
- if self.build_for_cpu:
- return []
- args = ['-O3']
- if self.is_rocm_pytorch():
- ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
- args += [
- '-std=c++17', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__',
- '-U__HIP_NO_HALF2_OPERATORS__',
- '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
- '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
- ]
- else:
- try:
- nvcc_threads = int(os.getenv("DS_NVCC_THREADS", ""))
- if nvcc_threads <= 0:
- raise ValueError("")
- except ValueError:
- nvcc_threads = min(os.cpu_count(), 8)
- cuda_major, cuda_minor = installed_cuda_version()
- if cuda_major > 10:
- if cuda_major == 12 and cuda_minor >= 5:
- std_lib = '-std=c++20'
- else:
- std_lib = '-std=c++17'
- else:
- std_lib = '-std=c++14'
- args += [
- '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', std_lib,
- '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
- f'--threads={nvcc_threads}'
- ]
- if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
- args.append('--ptxas-options=-v')
- args += self.compute_capability_args()
- return args
- def libraries_args(self):
- if self.build_for_cpu:
- return []
- if sys.platform == "win32":
- return ['cublas', 'curand']
- else:
- return []
- class TorchCPUOpBuilder(CUDAOpBuilder):
- def get_cuda_lib64_path(self):
- import torch
- if not self.is_rocm_pytorch():
- CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
- if not os.path.exists(CUDA_LIB64):
- CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib")
- else:
- CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
- return CUDA_LIB64
- def extra_ldflags(self):
- if self.build_for_cpu:
- return ['-fopenmp']
- if not self.is_rocm_pytorch():
- ld_flags = ['-lcurand']
- if not self.build_for_cpu:
- ld_flags.append(f'-L{self.get_cuda_lib64_path()}')
- return ld_flags
- return []
- def cxx_args(self):
- args = []
- if not self.build_for_cpu:
- CUDA_LIB64 = self.get_cuda_lib64_path()
- args += super().cxx_args()
- args += [
- f'-L{CUDA_LIB64}',
- '-lcudart',
- '-lcublas',
- '-g',
- ]
- CPU_ARCH = self.cpu_arch()
- SIMD_WIDTH = self.simd_width()
- CUDA_ENABLE = self.is_cuda_enable()
- args += [
- CPU_ARCH,
- '-fopenmp',
- SIMD_WIDTH,
- CUDA_ENABLE,
- ]
- return args
|