123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- import logging
- import ray._private.utils
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Files consulted when sampling CPU statistics.
PROC_STAT_PATH = "/proc/stat"
CPU_USAGE_PATH = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
CPU_SHARES_PATH = "/sys/fs/cgroup/cpu/cpu.shares"

# Lazily populated CPU counts; None until first computed.
container_num_cpus = host_num_cpus = None

# Samples remembered from the previous cpu_percent() call; None until the
# first call has completed.
last_cpu_usage = last_system_usage = None
def cpu_percent():
    """Estimate CPU usage percent for Ray pod managed by Kubernetes
    Operator.

    Computed by the following steps
       (1) Replicate the logic used by 'docker stats' cli command.
           See https://github.com/docker/cli/blob/c0a6b1c7b30203fbc28cd619acb901a95a80e30e/cli/command/container/stats_helpers.go#L166.
       (2) Divide by the number of CPUs available to the container, so that
           e.g. full capacity use of 2 CPUs will read as 100%,
           rather than 200%.

    Step (1) above works by
        dividing delta in cgroup's cpuacct.usage by
        delta in total host cpu usage, averaged over host's cpus.

    Since deltas are not initially available, return 0.0 on first call.

    Returns:
        The usage percentage, rounded to one decimal place and capped at
        100.0. 0.0 is returned on the first call, when no host CPU time
        has elapsed since the previous sample, and when any error occurs
        (the error is logged with its traceback).
    """  # noqa
    global last_system_usage
    global last_cpu_usage
    try:
        cpu_usage = _cpu_usage()
        system_usage = _system_usage()
        # Return 0.0 on first call.
        if last_system_usage is None:
            percent = 0.0
        else:
            cpu_delta = cpu_usage - last_cpu_usage
            # "System time passed." (Typically close to clock time.)
            system_delta = (system_usage - last_system_usage) / _host_num_cpus()
            if system_delta == 0:
                # No host time elapsed between samples, so there is nothing
                # to divide by. Keep the previous samples so that the next
                # call measures over a longer window.
                return 0.0
            quotient = cpu_delta / system_delta
            percent = round(quotient * 100 / ray._private.utils.get_k8s_cpus(), 1)
        last_system_usage = system_usage
        last_cpu_usage = cpu_usage
        # Computed percentage might be slightly above 100%.
        return min(percent, 100.0)
    except Exception:
        # logger.exception attaches the active exception and traceback on
        # its own; passing the exception as an extra positional argument
        # without a placeholder in the message breaks log formatting.
        logger.exception("Error computing CPU usage of Ray Kubernetes pod.")
        return 0.0
def _cpu_usage():
    """Compute total cpu usage of the container in nanoseconds
    by reading from cgroup/cpuacct.

    Returns:
        The contents of CPU_USAGE_PATH parsed as an int.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the file contents are not an integer.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(CPU_USAGE_PATH) as usage_file:
        return int(usage_file.read())
def _system_usage():
    """
    Computes total CPU usage of the host in nanoseconds.

    Logic taken from here:
    https://github.com/moby/moby/blob/b42ac8d370a8ef8ec720dff0ca9dfb3530ac0a6a/daemon/stats/collector_unix.go#L31

    See also the /proc/stat entry here:
    https://man7.org/linux/man-pages/man5/proc.5.html

    Returns:
        The sum of the first seven counters on the aggregate "cpu" line of
        PROC_STAT_PATH, converted from clock ticks to nanoseconds.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the first line is not the aggregate "cpu" line or a
            counter is not an integer.
    """  # noqa
    # Only the first (aggregate) line is needed, so avoid reading the rest.
    with open(PROC_STAT_PATH) as stat_file:
        cpu_summary_str = stat_file.readline()
    parts = cpu_summary_str.split()
    # Explicit check rather than `assert`, which is stripped under -O.
    if not parts or parts[0] != "cpu":
        raise ValueError(
            f"Unexpected first line in {PROC_STAT_PATH}: {cpu_summary_str!r}"
        )
    usage_data = parts[1:8]
    total_clock_ticks = sum(int(entry) for entry in usage_data)
    # 100 clock ticks per second, 10^9 ns per second
    usage_ns = total_clock_ticks * 10 ** 7
    return usage_ns
def _host_num_cpus():
    """Number of host CPUs, obtained by parsing /proc/stat.

    The value is computed on the first call and cached in the module-level
    ``host_num_cpus`` global; later calls return the cached value without
    touching the file again.

    Returns:
        The count of lines in PROC_STAT_PATH whose first word contains
        'cpu', minus one for the aggregate summary line.

    Raises:
        OSError: If the file cannot be opened or read (first call only).
    """
    global host_num_cpus
    if host_num_cpus is None:
        # Context manager ensures the file handle is closed promptly.
        with open(PROC_STAT_PATH) as stat_file:
            first_words = [line.split()[0] for line in stat_file if line.strip()]
        # Number of lines starting with a word including 'cpu', subtracting
        # 1 for the first summary line.
        cpu_line_count = sum(1 for word in first_words if "cpu" in word)
        host_num_cpus = cpu_line_count - 1
    return host_num_cpus
|