# k8s_utils.py
import logging

import ray._private.utils
  3. logger = logging.getLogger(__name__)
  4. CPU_SHARES_PATH = "/sys/fs/cgroup/cpu/cpu.shares"
  5. CPU_USAGE_PATH = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
  6. PROC_STAT_PATH = "/proc/stat"
  7. container_num_cpus = None
  8. host_num_cpus = None
  9. last_cpu_usage = None
  10. last_system_usage = None
  11. def cpu_percent():
  12. """Estimate CPU usage percent for Ray pod managed by Kubernetes
  13. Operator.
  14. Computed by the following steps
  15. (1) Replicate the logic used by 'docker stats' cli command.
  16. See https://github.com/docker/cli/blob/c0a6b1c7b30203fbc28cd619acb901a95a80e30e/cli/command/container/stats_helpers.go#L166.
  17. (2) Divide by the number of CPUs available to the container, so that
  18. e.g. full capacity use of 2 CPUs will read as 100%,
  19. rather than 200%.
  20. Step (1) above works by
  21. dividing delta in cgroup's cpuacct.usage by
  22. delta in total host cpu usage, averaged over host's cpus.
  23. Since deltas are not initially available, return 0.0 on first call.
  24. """ # noqa
  25. global last_system_usage
  26. global last_cpu_usage
  27. try:
  28. cpu_usage = _cpu_usage()
  29. system_usage = _system_usage()
  30. # Return 0.0 on first call.
  31. if last_system_usage is None:
  32. cpu_percent = 0.0
  33. else:
  34. cpu_delta = cpu_usage - last_cpu_usage
  35. # "System time passed." (Typically close to clock time.)
  36. system_delta = (system_usage - last_system_usage) / _host_num_cpus()
  37. quotient = cpu_delta / system_delta
  38. cpu_percent = round(quotient * 100 / ray._private.utils.get_k8s_cpus(), 1)
  39. last_system_usage = system_usage
  40. last_cpu_usage = cpu_usage
  41. # Computed percentage might be slightly above 100%.
  42. return min(cpu_percent, 100.0)
  43. except Exception as e:
  44. logger.exception("Error computing CPU usage of Ray Kubernetes pod.", e)
  45. return 0.0
  46. def _cpu_usage():
  47. """Compute total cpu usage of the container in nanoseconds
  48. by reading from cgroup/cpuacct."""
  49. return int(open(CPU_USAGE_PATH).read())
  50. def _system_usage():
  51. """
  52. Computes total CPU usage of the host in nanoseconds.
  53. Logic taken from here:
  54. https://github.com/moby/moby/blob/b42ac8d370a8ef8ec720dff0ca9dfb3530ac0a6a/daemon/stats/collector_unix.go#L31
  55. See also the /proc/stat entry here:
  56. https://man7.org/linux/man-pages/man5/proc.5.html
  57. """ # noqa
  58. cpu_summary_str = open(PROC_STAT_PATH).read().split("\n")[0]
  59. parts = cpu_summary_str.split()
  60. assert parts[0] == "cpu"
  61. usage_data = parts[1:8]
  62. total_clock_ticks = sum(int(entry) for entry in usage_data)
  63. # 100 clock ticks per second, 10^9 ns per second
  64. usage_ns = total_clock_ticks * 10 ** 7
  65. return usage_ns
  66. def _host_num_cpus():
  67. """Number of physical CPUs, obtained by parsing /proc/stat."""
  68. global host_num_cpus
  69. if host_num_cpus is None:
  70. proc_stat_lines = open(PROC_STAT_PATH).read().split("\n")
  71. split_proc_stat_lines = [line.split() for line in proc_stat_lines]
  72. cpu_lines = [
  73. split_line
  74. for split_line in split_proc_stat_lines
  75. if len(split_line) > 0 and "cpu" in split_line[0]
  76. ]
  77. # Number of lines starting with a word including 'cpu', subtracting
  78. # 1 for the first summary line.
  79. host_num_cpus = len(cpu_lines) - 1
  80. return host_num_cpus