# env_report.py
  1. # Copyright (c) Microsoft Corporation.
  2. # SPDX-License-Identifier: Apache-2.0
  3. # DeepSpeed Team
  4. import os
  5. import torch
  6. import deepspeed
  7. import subprocess
  8. import argparse
  9. from .ops.op_builder.all_ops import ALL_OPS
  10. from .git_version_info import installed_ops, torch_info, accelerator_name
  11. from deepspeed.accelerator import get_accelerator
# ANSI terminal escape codes used to colorize report status tags.
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
END = '\033[0m'  # resets terminal color back to default

# Pre-rendered colorized status tags printed throughout the report.
SUCCESS = f"{GREEN} [SUCCESS] {END}"
OKAY = f"{GREEN}[OKAY]{END}"
WARNING = f"{YELLOW}[WARNING]{END}"
FAIL = f'{RED}[FAIL]{END}'
INFO = '[INFO]'

# Number of invisible characters a colorized tag adds (escape prefix + reset);
# used to keep the dotted columns of the op table visually aligned.
color_len = len(GREEN) + len(END)

# NOTE(review): lowercase duplicates of OKAY/WARNING above — presumably kept
# for backwards compatibility with external importers; confirm before removing.
okay = f"{GREEN}[OKAY]{END}"
warning = f"{YELLOW}[WARNING]{END}"
def op_report(verbose=True):
    """Print a table showing, for every DeepSpeed C++/CUDA extension op,
    whether it is pre-installed and whether it is JIT-compatible on this system.

    Args:
        verbose (bool): forwarded to each builder's ``is_compatible`` so it can
            print detailed diagnostics about missing dependencies.
    """
    # Column widths: dots pad the "op name" and "installed" columns.
    max_dots = 23
    max_dots2 = 11
    h = ["op name", "installed", "compatible"]
    print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1])))
    print("DeepSpeed C++/CUDA extension op report")
    print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1])))
    print("NOTE: Ops not installed will be just-in-time (JIT) compiled at\n"
          " runtime if needed. Op compatibility means that your system\n"
          " meet the required dependencies to JIT install the op.")
    print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1])))
    print("JIT compiled ops requires ninja")
    ninja_status = OKAY if ninja_installed() else FAIL
    print('ninja', "." * (max_dots - 5), ninja_status)
    print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1])))
    # Table header row.
    print(h[0], "." * (max_dots - len(h[0])), h[1], "." * (max_dots2 - len(h[1])), h[2])
    print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1])))
    installed = f"{GREEN}[YES]{END}"
    no = f"{YELLOW}[NO]{END}"
    for op_name, builder in ALL_OPS.items():
        dots = "." * (max_dots - len(op_name))
        is_compatible = OKAY if builder.is_compatible(verbose) else no
        # An op counts as installed only if it was prebuilt for the same
        # accelerator the wheel was compiled against.
        is_installed = installed if installed_ops.get(op_name,
                                                      False) and accelerator_name == get_accelerator()._name else no
        # Subtract color_len so the invisible ANSI escape characters in the
        # status tag do not throw off the dotted column alignment.
        dots2 = '.' * ((len(h[1]) + (max_dots2 - len(h[1]))) - (len(is_installed) - color_len))
        print(op_name, dots, is_installed, dots2, is_compatible)
    print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1])))
  51. def ninja_installed():
  52. try:
  53. import ninja # noqa: F401 # type: ignore
  54. except ImportError:
  55. return False
  56. return True
  57. def nvcc_version():
  58. import torch.utils.cpp_extension
  59. cuda_home = torch.utils.cpp_extension.CUDA_HOME
  60. if cuda_home is None:
  61. return f"{RED} [FAIL] cannot find CUDA_HOME via torch.utils.cpp_extension.CUDA_HOME={torch.utils.cpp_extension.CUDA_HOME} {END}"
  62. try:
  63. output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True)
  64. except FileNotFoundError:
  65. return f"{RED} [FAIL] nvcc missing {END}"
  66. output_split = output.split()
  67. release_idx = output_split.index("release")
  68. release = output_split[release_idx + 1].replace(',', '').split(".")
  69. return ".".join(release)
  70. def installed_cann_path():
  71. if "ASCEND_HOME_PATH" in os.environ or os.path.exists(os.environ["ASCEND_HOME_PATH"]):
  72. return os.environ["ASCEND_HOME_PATH"]
  73. return None
  74. def installed_cann_version():
  75. import re
  76. ascend_path = installed_cann_path()
  77. if ascend_path is None:
  78. return f"CANN_HOME does not exist, unable to compile NPU op(s)"
  79. cann_version = ""
  80. for dirpath, _, filenames in os.walk(os.path.realpath(ascend_path)):
  81. if cann_version:
  82. break
  83. install_files = [file for file in filenames if re.match(r"ascend_.*_install\.info", file)]
  84. if install_files:
  85. filepath = os.path.join(dirpath, install_files[0])
  86. with open(filepath, "r") as f:
  87. for line in f:
  88. if line.find("version") != -1:
  89. cann_version = line.strip().split("=")[-1]
  90. break
  91. return cann_version
  92. def get_shm_size():
  93. try:
  94. shm_stats = os.statvfs('/dev/shm')
  95. except (OSError, FileNotFoundError, ValueError, AttributeError):
  96. return "UNKNOWN", None
  97. shm_size = shm_stats.f_frsize * shm_stats.f_blocks
  98. shm_hbytes = human_readable_size(shm_size)
  99. warn = []
  100. if shm_size < 512 * 1024**2:
  101. warn.append(
  102. f" {YELLOW} [WARNING] /dev/shm size might be too small, if running in docker increase to at least --shm-size='1gb' {END}"
  103. )
  104. if get_accelerator().communication_backend_name() == "nccl":
  105. warn.append(
  106. f" {YELLOW} [WARNING] see more details about NCCL requirements: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#sharing-data {END}"
  107. )
  108. return shm_hbytes, warn
  109. def human_readable_size(size):
  110. units = ['B', 'KB', 'MB', 'GB', 'TB']
  111. i = 0
  112. while size >= 1024 and i < len(units) - 1:
  113. size /= 1024
  114. i += 1
  115. return f'{size:.2f} {units[i]}'
def debug_report():
    """Print general environment info: torch/deepspeed install paths and
    versions, accelerator toolchain versions, and /dev/shm size with any
    shared-memory warnings.
    """
    # Width of the dotted leader between the label and its value.
    max_dots = 33
    report = [("torch install path", torch.__path__), ("torch version", torch.__version__),
              ("deepspeed install path", deepspeed.__path__),
              ("deepspeed info", f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}")]
    if get_accelerator().device_name() == 'cuda':
        # On ROCm builds torch.version.hip is set; nvcc is only meaningful
        # on genuine CUDA builds, so it is skipped when hip_version is set.
        hip_version = getattr(torch.version, "hip", None)
        report.extend([("torch cuda version", torch.version.cuda), ("torch hip version", hip_version),
                       ("nvcc version", (None if hip_version else nvcc_version())),
                       ("deepspeed wheel compiled w.", f"torch {torch_info['version']}, " +
                        (f"hip {torch_info['hip_version']}" if hip_version else f"cuda {torch_info['cuda_version']}"))
                       ])
    elif get_accelerator().device_name() == 'npu':
        # Ascend NPU: torch_npu is only importable on NPU systems, so the
        # import is deferred to this branch.
        import torch_npu
        report.extend([("deepspeed wheel compiled w.", f"torch {torch_info['version']}"),
                       ("torch_npu install path", torch_npu.__path__), ("torch_npu version", torch_npu.__version__),
                       ("ascend_cann version", installed_cann_version())])
    else:
        report.extend([("deepspeed wheel compiled w.", f"torch {torch_info['version']} ")])
    # get_shm_size() returns (size_string, warnings) — unpacked in the loop.
    report.append(("shared memory (/dev/shm) size", get_shm_size()))
    print("DeepSpeed general environment info:")
    for name, value in report:
        warns = []
        if isinstance(value, tuple):
            value, warns = value
        print(name, "." * (max_dots - len(name)), value)
        if warns:
            for warn in warns:
                print(warn)
  145. def parse_arguments():
  146. parser = argparse.ArgumentParser()
  147. parser.add_argument('--hide_operator_status',
  148. action='store_true',
  149. help='Suppress display of installation and compatibility statuses of DeepSpeed operators. ')
  150. parser.add_argument('--hide_errors_and_warnings', action='store_true', help='Suppress warning and error messages.')
  151. args = parser.parse_args()
  152. return args
  153. def main(hide_operator_status=False, hide_errors_and_warnings=False):
  154. if not hide_operator_status:
  155. op_report(verbose=not hide_errors_and_warnings)
  156. debug_report()
  157. def cli_main():
  158. args = parse_arguments()
  159. main(hide_operator_status=args.hide_operator_status, hide_errors_and_warnings=args.hide_errors_and_warnings)
if __name__ == "__main__":
    # Direct script invocation: run the full report with default flags
    # (CLI flags are only parsed via the cli_main console entry point).
    main()