123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- # Copyright (c) Microsoft Corporation.
- # SPDX-License-Identifier: Apache-2.0
- # DeepSpeed Team
- import os
- import torch
- import deepspeed
- import subprocess
- import argparse
- from .ops.op_builder.all_ops import ALL_OPS
- from .git_version_info import installed_ops, torch_info
- from deepspeed.accelerator import get_accelerator
# ANSI escape sequences used to colorize terminal output; END resets the color.
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
END = '\033[0m'

# Pre-colored status tags printed by the report functions.
SUCCESS = GREEN + " [SUCCESS] " + END
OKAY = GREEN + "[OKAY]" + END
WARNING = YELLOW + "[WARNING]" + END
FAIL = RED + '[FAIL]' + END
INFO = '[INFO]'

# Number of invisible characters a color/reset pair adds to a tag; subtracted
# when computing the visible width of a colored string.
color_len = len(GREEN + END)

# Lowercase aliases carrying the same values as OKAY / WARNING.
okay = OKAY
warning = WARNING
def op_report(verbose=True):
    """Print a table listing every op in ``ALL_OPS`` with its status.

    For each op the table shows whether it is recorded as installed in
    ``installed_ops`` and whether its builder reports it as compatible.
    A header section also reports whether ``ninja`` can be imported.

    Args:
        verbose: Passed through to each builder's ``is_compatible`` call.
    """
    name_width = 23
    status_width = 11
    headers = ["op name", "installed", "compatible"]
    # Horizontal rule reused between every section of the report.
    rule = "-" * (name_width + status_width + len(headers[0]) + len(headers[1]))

    print(rule)
    print("DeepSpeed C++/CUDA extension op report")
    print(rule)
    print("NOTE: Ops not installed will be just-in-time (JIT) compiled at\n"
          " runtime if needed. Op compatibility means that your system\n"
          " meet the required dependencies to JIT install the op.")
    print(rule)
    print("JIT compiled ops requires ninja")
    if ninja_installed():
        ninja_status = OKAY
    else:
        ninja_status = FAIL
    print('ninja', "." * (name_width - 5), ninja_status)
    print(rule)
    print(headers[0], "." * (name_width - len(headers[0])), headers[1], "." * (status_width - len(headers[1])),
          headers[2])
    print(rule)

    yes_tag = f"{GREEN}[YES]{END}"
    no_tag = f"{YELLOW}[NO]{END}"
    for op_name, builder in ALL_OPS.items():
        name_padding = "." * (name_width - len(op_name))
        compatible_tag = OKAY if builder.is_compatible(verbose) else no_tag
        installed_tag = yes_tag if installed_ops.get(op_name, False) else no_tag
        # Pad using the visible width of the tag, i.e. without its color codes.
        visible_width = len(installed_tag) - color_len
        status_padding = '.' * (status_width - visible_width)
        print(op_name, name_padding, installed_tag, status_padding, compatible_tag)
    print(rule)
def ninja_installed():
    """Return True if the ``ninja`` package can be imported, else False."""
    try:
        import ninja  # noqa: F401 # type: ignore
    except ImportError:
        available = False
    else:
        available = True
    return available
def nvcc_version():
    """Return the CUDA release reported by ``nvcc -V`` as a string.

    The compiler is looked up under ``torch.utils.cpp_extension.CUDA_HOME``.
    This function is used for diagnostic output, so every failure mode is
    reported as a red ``[FAIL]`` message string instead of raising.

    Returns:
        str: The token following ``release`` in the nvcc output with any
        comma removed (e.g. ``"11.8"``), or a ``[FAIL]`` message when
        CUDA_HOME is unset, nvcc is missing or cannot be run, or its output
        does not contain a release token.
    """
    import torch.utils.cpp_extension

    cuda_home = torch.utils.cpp_extension.CUDA_HOME
    if cuda_home is None:
        return f"{RED} [FAIL] cannot find CUDA_HOME via torch.utils.cpp_extension.CUDA_HOME={cuda_home} {END}"

    nvcc_path = os.path.join(cuda_home, "bin", "nvcc")
    try:
        output = subprocess.check_output([nvcc_path, "-V"], universal_newlines=True)
    except FileNotFoundError:
        return f"{RED} [FAIL] nvcc missing {END}"
    except (OSError, subprocess.CalledProcessError) as err:
        # nvcc exists but is not executable, or exited with a non-zero status.
        return f"{RED} [FAIL] nvcc could not be executed: {err} {END}"

    tokens = output.split()
    try:
        # The version is the token right after the word "release".
        release = tokens[tokens.index("release") + 1]
    except (ValueError, IndexError):
        return f"{RED} [FAIL] cannot parse nvcc version from its output {END}"
    return release.replace(',', '')
def get_shm_size():
    """Return the size of ``/dev/shm`` together with any related warnings.

    Returns:
        tuple: ``(size, warnings)``. ``size`` is a human-readable string
        produced by ``human_readable_size`` and ``warnings`` is a list of
        pre-formatted warning strings (possibly empty). When the filesystem
        cannot be queried the result is ``("UNKNOWN", None)``.
    """
    try:
        shm_stats = os.statvfs('/dev/shm')
    except (OSError, ValueError, AttributeError):
        # OSError covers a missing /dev/shm; AttributeError covers platforms
        # where os.statvfs is not available at all.
        return "UNKNOWN", None

    shm_size = shm_stats.f_frsize * shm_stats.f_blocks
    shm_hbytes = human_readable_size(shm_size)

    warn = []
    # Warn when shared memory is smaller than 512 MB.
    if shm_size < 512 * 1024**2:
        warn.append(
            f" {YELLOW} [WARNING] /dev/shm size might be too small, if running in docker increase to at least --shm-size='1gb' {END}"
        )
        if get_accelerator().communication_backend_name() == "nccl":
            warn.append(
                f" {YELLOW} [WARNING] see more details about NCCL requirements: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#sharing-data {END}"
            )
    return shm_hbytes, warn
def human_readable_size(size):
    """Format a byte count using the largest fitting unit from B to TB.

    Args:
        size: Number of bytes.

    Returns:
        str: The value with two decimals followed by its unit, e.g.
        ``"1.50 KB"``. Values of 1024 TB or more are still expressed in TB.
    """
    *smaller_units, largest_unit = ('B', 'KB', 'MB', 'GB', 'TB')
    for unit in smaller_units:
        if size < 1024:
            return f'{size:.2f} {unit}'
        size /= 1024
    return f'{size:.2f} {largest_unit}'
def debug_report():
    """Print general environment information as dotted ``name ... value`` rows.

    The rows cover the torch and deepspeed install paths and versions, the
    toolchain details when the accelerator device name is ``'cuda'``, and
    the shared-memory size. A row whose value is a ``(value, warnings)``
    tuple has its warnings printed on the lines following the row.
    """
    label_width = 33

    rows = [
        ("torch install path", torch.__path__),
        ("torch version", torch.__version__),
        ("deepspeed install path", deepspeed.__path__),
        ("deepspeed info", f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"),
    ]

    if get_accelerator().device_name() == 'cuda':
        hip_version = getattr(torch.version, "hip", None)
        rows.append(("torch cuda version", torch.version.cuda))
        rows.append(("torch hip version", hip_version))
        # nvcc is only queried when torch does not report a hip version.
        rows.append(("nvcc version", None if hip_version else nvcc_version()))
        compiled_with = f"torch {torch_info['version']}, "
        if hip_version:
            compiled_with += f"hip {torch_info['hip_version']}"
        else:
            compiled_with += f"cuda {torch_info['cuda_version']}"
        rows.append(("deepspeed wheel compiled w.", compiled_with))
    else:
        rows.append(("deepspeed wheel compiled w.", f"torch {torch_info['version']} "))

    rows.append(("shared memory (/dev/shm) size", get_shm_size()))

    print("DeepSpeed general environment info:")
    for label, value in rows:
        if isinstance(value, tuple):
            value, notes = value
        else:
            notes = []
        print(label, "." * (label_width - len(label)), value)
        # notes may be None or empty, in which case nothing extra is printed.
        for note in notes or ():
            print(note)
def parse_arguments():
    """Parse the command-line flags accepted by the report.

    Returns:
        argparse.Namespace: Has the boolean attributes
        ``hide_operator_status`` and ``hide_errors_and_warnings``, both
        False unless the matching flag is given.
    """
    flag_help = (
        ('--hide_operator_status',
         'Suppress display of installation and compatibility statuses of DeepSpeed operators. '),
        ('--hide_errors_and_warnings', 'Suppress warning and error messages.'),
    )
    parser = argparse.ArgumentParser()
    for flag, help_text in flag_help:
        parser.add_argument(flag, action='store_true', help=help_text)
    return parser.parse_args()
def main(hide_operator_status=False, hide_errors_and_warnings=False):
    """Print the op report (unless hidden) followed by the environment report.

    Args:
        hide_operator_status: When True, the op report is skipped.
        hide_errors_and_warnings: When True, the op report is run with
            ``verbose=False``.
    """
    show_operator_status = not hide_operator_status
    if show_operator_status:
        verbose = not hide_errors_and_warnings
        op_report(verbose=verbose)
    debug_report()
def cli_main():
    """Parse the command-line flags and forward them to ``main``."""
    options = parse_arguments()
    main(
        hide_operator_status=options.hide_operator_status,
        hide_errors_and_warnings=options.hide_errors_and_warnings,
    )
if __name__ == "__main__":
    # Go through cli_main() so that --hide_operator_status and
    # --hide_errors_and_warnings are honored when the module is run as a
    # script; calling main() directly would silently ignore them.
    cli_main()
|