thermald.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. #!/usr/bin/env python3
  2. import datetime
  3. import os
  4. import queue
  5. import threading
  6. import time
  7. from collections import OrderedDict, namedtuple
  8. from pathlib import Path
  9. from typing import Dict, Optional, Tuple
  10. import psutil
  11. import cereal.messaging as messaging
  12. from cereal import log
  13. from common.dict_helpers import strip_deprecated_keys
  14. from common.filter_simple import FirstOrderFilter
  15. from common.params import Params
  16. from common.realtime import DT_TRML, sec_since_boot
  17. from selfdrive.controls.lib.alertmanager import set_offroad_alert
  18. from system.hardware import HARDWARE, TICI, AGNOS
  19. from selfdrive.loggerd.config import get_available_percent
  20. from selfdrive.statsd import statlog
  21. from system.swaglog import cloudlog
  22. from selfdrive.thermald.power_monitoring import PowerMonitoring
  23. from selfdrive.thermald.fan_controller import TiciFanController
  24. from system.version import terms_version, training_version
  25. ThermalStatus = log.DeviceState.ThermalStatus
  26. NetworkType = log.DeviceState.NetworkType
  27. NetworkStrength = log.DeviceState.NetworkStrength
  28. CURRENT_TAU = 15. # 15s time constant
  29. TEMP_TAU = 5. # 5s time constant
  30. DISCONNECT_TIMEOUT = 5. # wait 5 seconds before going offroad after disconnect so you get an alert
  31. PANDA_STATES_TIMEOUT = int(1000 * 1.5 * DT_TRML) # 1.5x the expected pandaState frequency
  32. ThermalBand = namedtuple("ThermalBand", ['min_temp', 'max_temp'])
  33. HardwareState = namedtuple("HardwareState", ['network_type', 'network_info', 'network_strength', 'network_stats', 'network_metered', 'nvme_temps', 'modem_temps'])
  34. # List of thermal bands. We will stay within this region as long as we are within the bounds.
  35. # When exiting the bounds, we'll jump to the lower or higher band. Bands are ordered in the dict.
  36. THERMAL_BANDS = OrderedDict({
  37. ThermalStatus.green: ThermalBand(None, 80.0),
  38. ThermalStatus.yellow: ThermalBand(75.0, 96.0),
  39. ThermalStatus.red: ThermalBand(80.0, 107.),
  40. ThermalStatus.danger: ThermalBand(94.0, None),
  41. })
  42. # Override to highest thermal band when offroad and above this temp
  43. OFFROAD_DANGER_TEMP = 79.5
  44. prev_offroad_states: Dict[str, Tuple[bool, Optional[str]]] = {}
  45. tz_by_type: Optional[Dict[str, int]] = None
  46. def populate_tz_by_type():
  47. global tz_by_type
  48. tz_by_type = {}
  49. for n in os.listdir("/sys/devices/virtual/thermal"):
  50. if not n.startswith("thermal_zone"):
  51. continue
  52. with open(os.path.join("/sys/devices/virtual/thermal", n, "type")) as f:
  53. tz_by_type[f.read().strip()] = int(n.lstrip("thermal_zone"))
  54. def read_tz(x):
  55. if x is None:
  56. return 0
  57. if isinstance(x, str):
  58. if tz_by_type is None:
  59. populate_tz_by_type()
  60. x = tz_by_type[x]
  61. try:
  62. with open(f"/sys/devices/virtual/thermal/thermal_zone{x}/temp") as f:
  63. return int(f.read())
  64. except FileNotFoundError:
  65. return 0
  66. def read_thermal(thermal_config):
  67. dat = messaging.new_message('deviceState')
  68. dat.deviceState.cpuTempC = [read_tz(z) / thermal_config.cpu[1] for z in thermal_config.cpu[0]]
  69. dat.deviceState.gpuTempC = [read_tz(z) / thermal_config.gpu[1] for z in thermal_config.gpu[0]]
  70. dat.deviceState.memoryTempC = read_tz(thermal_config.mem[0]) / thermal_config.mem[1]
  71. dat.deviceState.ambientTempC = read_tz(thermal_config.ambient[0]) / thermal_config.ambient[1]
  72. dat.deviceState.pmicTempC = [read_tz(z) / thermal_config.pmic[1] for z in thermal_config.pmic[0]]
  73. return dat
  74. def set_offroad_alert_if_changed(offroad_alert: str, show_alert: bool, extra_text: Optional[str]=None):
  75. if prev_offroad_states.get(offroad_alert, None) == (show_alert, extra_text):
  76. return
  77. prev_offroad_states[offroad_alert] = (show_alert, extra_text)
  78. set_offroad_alert(offroad_alert, show_alert, extra_text)
  79. def hw_state_thread(end_event, hw_queue):
  80. """Handles non critical hardware state, and sends over queue"""
  81. count = 0
  82. prev_hw_state = None
  83. modem_version = None
  84. modem_nv = None
  85. modem_configured = False
  86. while not end_event.is_set():
  87. # these are expensive calls. update every 10s
  88. if (count % int(10. / DT_TRML)) == 0:
  89. try:
  90. network_type = HARDWARE.get_network_type()
  91. modem_temps = HARDWARE.get_modem_temperatures()
  92. if len(modem_temps) == 0 and prev_hw_state is not None:
  93. modem_temps = prev_hw_state.modem_temps
  94. # Log modem version once
  95. if AGNOS and ((modem_version is None) or (modem_nv is None)):
  96. modem_version = HARDWARE.get_modem_version() # pylint: disable=assignment-from-none
  97. modem_nv = HARDWARE.get_modem_nv() # pylint: disable=assignment-from-none
  98. if (modem_version is not None) and (modem_nv is not None):
  99. cloudlog.event("modem version", version=modem_version, nv=modem_nv)
  100. tx, rx = HARDWARE.get_modem_data_usage()
  101. hw_state = HardwareState(
  102. network_type=network_type,
  103. network_info=HARDWARE.get_network_info(),
  104. network_strength=HARDWARE.get_network_strength(network_type),
  105. network_stats={'wwanTx': tx, 'wwanRx': rx},
  106. network_metered=HARDWARE.get_network_metered(network_type),
  107. nvme_temps=HARDWARE.get_nvme_temperatures(),
  108. modem_temps=modem_temps,
  109. )
  110. try:
  111. hw_queue.put_nowait(hw_state)
  112. except queue.Full:
  113. pass
  114. # TODO: remove this once the config is in AGNOS
  115. if not modem_configured and len(HARDWARE.get_sim_info().get('sim_id', '')) > 0:
  116. cloudlog.warning("configuring modem")
  117. HARDWARE.configure_modem()
  118. modem_configured = True
  119. prev_hw_state = hw_state
  120. except Exception:
  121. cloudlog.exception("Error getting hardware state")
  122. count += 1
  123. time.sleep(DT_TRML)
def thermald_thread(end_event, hw_queue):
    """Main loop: publish ``deviceState`` and drive the onroad/offroad transition.

    Each iteration waits on ``pandaStates`` (up to ``PANDA_STATES_TIMEOUT``),
    reads the thermal zones, merges the newest ``HardwareState`` from
    ``hw_queue`` if one is waiting, updates the thermal status, evaluates the
    startup and onroad conditions, writes the IsOnroad / IsOffroad / IsEngaged
    params, runs power monitoring and finally sends the message.

    Args:
        end_event: The loop exits once this event is set.
        hw_queue: Queue polled without blocking for ``HardwareState`` snapshots.
    """
    pm = messaging.PubMaster(['deviceState'])
    sm = messaging.SubMaster(["peripheralState", "gpsLocationExternal", "controlsState", "pandaStates"], poll=["pandaStates"])

    count = 0

    # Conditions that must all hold for the device to be (or stay) onroad.
    onroad_conditions: Dict[str, bool] = {
        "ignition": False,
    }
    # Conditions that are additionally required only while not yet started.
    startup_conditions: Dict[str, bool] = {}
    startup_conditions_prev: Dict[str, bool] = {}

    off_ts = None  # sec_since_boot() when we last went offroad, None while onroad
    started_ts = None  # sec_since_boot() when we last went onroad, None while offroad
    started_seen = False  # True once the device has been onroad at least once
    thermal_status = ThermalStatus.green

    # Placeholder used until hw_state_thread delivers its first snapshot.
    last_hw_state = HardwareState(
        network_type=NetworkType.none,
        network_info=None,
        network_metered=False,
        network_strength=NetworkStrength.unknown,
        network_stats={'wwanTx': -1, 'wwanRx': -1},
        nvme_temps=[],
        modem_temps=[],
    )

    all_temp_filter = FirstOrderFilter(0., TEMP_TAU, DT_TRML)
    offroad_temp_filter = FirstOrderFilter(0., TEMP_TAU, DT_TRML)
    should_start_prev = False
    in_car = False
    engaged_prev = False

    params = Params()
    power_monitor = PowerMonitoring()

    HARDWARE.initialize_hardware()
    thermal_config = HARDWARE.get_thermal_config()

    fan_controller = None

    while not end_event.is_set():
        sm.update(PANDA_STATES_TIMEOUT)

        pandaStates = sm['pandaStates']
        peripheralState = sm['peripheralState']

        msg = read_thermal(thermal_config)

        if sm.updated['pandaStates'] and len(pandaStates) > 0:

            # Set ignition based on any panda connected
            onroad_conditions["ignition"] = any(ps.ignitionLine or ps.ignitionCan for ps in pandaStates if ps.pandaType != log.PandaState.PandaType.unknown)

            pandaState = pandaStates[0]

            in_car = pandaState.harnessStatus != log.PandaState.HarnessStatus.notConnected

            # Setup fan handler on first connect to panda
            if fan_controller is None and peripheralState.pandaType != log.PandaState.PandaType.unknown:
                if TICI:
                    fan_controller = TiciFanController()

        elif (sec_since_boot() - sm.rcv_time['pandaStates']) > DISCONNECT_TIMEOUT:
            # No pandaStates for longer than DISCONNECT_TIMEOUT: drop ignition.
            if onroad_conditions["ignition"]:
                onroad_conditions["ignition"] = False
                cloudlog.error("panda timed out onroad")

        # Take a fresh hardware snapshot if one is waiting; otherwise keep the last one.
        try:
            last_hw_state = hw_queue.get_nowait()
        except queue.Empty:
            pass

        msg.deviceState.freeSpacePercent = get_available_percent(default=100.0)
        msg.deviceState.memoryUsagePercent = int(round(psutil.virtual_memory().percent))
        msg.deviceState.cpuUsagePercent = [int(round(n)) for n in psutil.cpu_percent(percpu=True)]
        msg.deviceState.gpuUsagePercent = int(round(HARDWARE.get_gpu_usage_percent()))

        msg.deviceState.networkType = last_hw_state.network_type
        msg.deviceState.networkMetered = last_hw_state.network_metered
        msg.deviceState.networkStrength = last_hw_state.network_strength
        msg.deviceState.networkStats = last_hw_state.network_stats
        if last_hw_state.network_info is not None:
            msg.deviceState.networkInfo = last_hw_state.network_info

        msg.deviceState.nvmeTempC = last_hw_state.nvme_temps
        msg.deviceState.modemTempC = last_hw_state.modem_temps

        msg.deviceState.screenBrightnessPercent = HARDWARE.get_screen_brightness()

        # this one is only used for offroad
        temp_sources = [
            msg.deviceState.memoryTempC,
            max(msg.deviceState.cpuTempC),
            max(msg.deviceState.gpuTempC),
        ]
        offroad_comp_temp = offroad_temp_filter.update(max(temp_sources))

        # this drives the thermal status while onroad
        temp_sources.append(max(msg.deviceState.pmicTempC))
        all_comp_temp = all_temp_filter.update(max(temp_sources))

        if fan_controller is not None:
            msg.deviceState.fanSpeedPercentDesired = fan_controller.update(all_comp_temp, onroad_conditions["ignition"])

        # True when not started and either never started, no off timestamp yet,
        # or the off timestamp is more than five minutes old.
        is_offroad_for_5_min = (started_ts is None) and ((not started_seen) or (off_ts is None) or (sec_since_boot() - off_ts > 60 * 5))
        if is_offroad_for_5_min and offroad_comp_temp > OFFROAD_DANGER_TEMP:
            # If device is offroad we want to cool down before going onroad
            # since going onroad increases load and can make temps go over 107
            thermal_status = ThermalStatus.danger
        else:
            # Step to the adjacent band when the filtered temperature leaves
            # the current band's [min_temp, max_temp] range; otherwise stay put.
            current_band = THERMAL_BANDS[thermal_status]
            band_idx = list(THERMAL_BANDS.keys()).index(thermal_status)
            if current_band.min_temp is not None and all_comp_temp < current_band.min_temp:
                thermal_status = list(THERMAL_BANDS.keys())[band_idx - 1]
            elif current_band.max_temp is not None and all_comp_temp > current_band.max_temp:
                thermal_status = list(THERMAL_BANDS.keys())[band_idx + 1]

        # **** starting logic ****

        # Ensure date/time are valid
        now = datetime.datetime.utcnow()
        startup_conditions["time_valid"] = (now.year > 2020) or (now.year == 2020 and now.month >= 10)
        set_offroad_alert_if_changed("Offroad_InvalidTime", (not startup_conditions["time_valid"]))

        startup_conditions["up_to_date"] = params.get("Offroad_ConnectivityNeeded") is None or params.get_bool("DisableUpdates") or params.get_bool("SnoozeUpdate")
        startup_conditions["not_uninstalling"] = not params.get_bool("DoUninstall")
        startup_conditions["accepted_terms"] = params.get("HasAcceptedTerms") == terms_version
        # After having been onroad, require more than 5 s offroad before starting again.
        startup_conditions["offroad_min_time"] = (not started_seen) or ((off_ts is not None) and (sec_since_boot() - off_ts) > 5.)

        # with 2% left, we killall, otherwise the phone will take a long time to boot
        startup_conditions["free_space"] = msg.deviceState.freeSpacePercent > 2
        startup_conditions["completed_training"] = params.get("CompletedTrainingVersion") == training_version or \
            params.get_bool("Passive")
        startup_conditions["not_driver_view"] = not params.get_bool("IsDriverViewEnabled")
        startup_conditions["not_taking_snapshot"] = not params.get_bool("IsTakingSnapshot")
        # if any CPU gets above 107 or the battery gets above 63, kill all processes
        # controls will warn with CPU above 95 or battery above 60
        onroad_conditions["device_temp_good"] = thermal_status < ThermalStatus.danger
        set_offroad_alert_if_changed("Offroad_TemperatureTooHigh", (not onroad_conditions["device_temp_good"]))

        # TODO: this should move to TICI.initialize_hardware, but we currently can't import params there
        if TICI:
            # The presence of this file skips the storage checks entirely.
            if not os.path.isfile("/persist/comma/living-in-the-moment"):
                if not Path("/data/media").is_mount():
                    set_offroad_alert_if_changed("Offroad_StorageMissing", True)
                else:
                    # check for bad NVMe
                    try:
                        with open("/sys/block/nvme0n1/device/model") as f:
                            model = f.read().strip()
                        if not model.startswith("Samsung SSD 980") and params.get("Offroad_BadNvme") is None:
                            set_offroad_alert_if_changed("Offroad_BadNvme", True)
                            cloudlog.event("Unsupported NVMe", model=model, error=True)
                    except Exception:
                        # Best effort: any failure of this check is ignored.
                        pass

        # Handle offroad/onroad transition
        should_start = all(onroad_conditions.values())
        if started_ts is None:
            # Startup conditions only gate the transition into onroad; once
            # started, only the onroad conditions are evaluated.
            should_start = should_start and all(startup_conditions.values())

        # On a transition, and on the very first iteration, write the new
        # state to params and reset the engaged flag.
        if should_start != should_start_prev or (count == 0):
            params.put_bool("IsOnroad", should_start)
            params.put_bool("IsOffroad", not should_start)

            params.put_bool("IsEngaged", False)
            engaged_prev = False
            HARDWARE.set_power_save(not should_start)

        if sm.updated['controlsState']:
            engaged = sm['controlsState'].enabled
            if engaged != engaged_prev:
                params.put_bool("IsEngaged", engaged)
                engaged_prev = engaged

                # Best effort: also note the engagement change in /dev/kmsg.
                try:
                    with open('/dev/kmsg', 'w') as kmsg:
                        kmsg.write(f"<3>[thermald] engaged: {engaged}\n")
                except Exception:
                    pass

        if should_start:
            off_ts = None
            if started_ts is None:
                started_ts = sec_since_boot()
                started_seen = True
        else:
            # Log why startup is blocked, but only when the set of startup
            # conditions changed since the last time it was logged.
            if onroad_conditions["ignition"] and (startup_conditions != startup_conditions_prev):
                cloudlog.event("Startup blocked", startup_conditions=startup_conditions, onroad_conditions=onroad_conditions, error=True)
                startup_conditions_prev = startup_conditions.copy()

            started_ts = None
            if off_ts is None:
                off_ts = sec_since_boot()

        # Offroad power monitoring
        # No voltage is passed while the peripheral panda type is unknown.
        voltage = None if peripheralState.pandaType == log.PandaState.PandaType.unknown else peripheralState.voltage
        power_monitor.calculate(voltage, onroad_conditions["ignition"])
        msg.deviceState.offroadPowerUsageUwh = power_monitor.get_power_used()
        msg.deviceState.carBatteryCapacityUwh = max(0, power_monitor.get_car_battery_capacity())
        current_power_draw = HARDWARE.get_current_power_draw()
        statlog.sample("power_draw", current_power_draw)
        msg.deviceState.powerDrawW = current_power_draw

        som_power_draw = HARDWARE.get_som_power_draw()
        statlog.sample("som_power_draw", som_power_draw)
        msg.deviceState.somPowerDrawW = som_power_draw

        # Check if we need to shut down
        if power_monitor.should_shutdown(onroad_conditions["ignition"], in_car, off_ts, started_seen):
            cloudlog.warning(f"shutting device down, offroad since {off_ts}")
            params.put_bool("DoShutdown", True)

        msg.deviceState.started = started_ts is not None
        # started_ts is in seconds; reported in nanoseconds, 0 while not started.
        msg.deviceState.startedMonoTime = int(1e9*(started_ts or 0))

        last_ping = params.get("LastAthenaPingTime")
        if last_ping is not None:
            msg.deviceState.lastAthenaPingTime = int(last_ping)

        msg.deviceState.thermalStatus = thermal_status
        pm.send("deviceState", msg)

        should_start_prev = should_start

        # Log to statsd
        statlog.gauge("free_space_percent", msg.deviceState.freeSpacePercent)
        statlog.gauge("gpu_usage_percent", msg.deviceState.gpuUsagePercent)
        statlog.gauge("memory_usage_percent", msg.deviceState.memoryUsagePercent)
        for i, usage in enumerate(msg.deviceState.cpuUsagePercent):
            statlog.gauge(f"cpu{i}_usage_percent", usage)
        for i, temp in enumerate(msg.deviceState.cpuTempC):
            statlog.gauge(f"cpu{i}_temperature", temp)
        for i, temp in enumerate(msg.deviceState.gpuTempC):
            statlog.gauge(f"gpu{i}_temperature", temp)
        statlog.gauge("memory_temperature", msg.deviceState.memoryTempC)
        statlog.gauge("ambient_temperature", msg.deviceState.ambientTempC)
        for i, temp in enumerate(msg.deviceState.pmicTempC):
            statlog.gauge(f"pmic{i}_temperature", temp)
        for i, temp in enumerate(last_hw_state.nvme_temps):
            statlog.gauge(f"nvme_temperature{i}", temp)
        for i, temp in enumerate(last_hw_state.modem_temps):
            statlog.gauge(f"modem_temperature{i}", temp)
        statlog.gauge("fan_speed_percent_desired", msg.deviceState.fanSpeedPercentDesired)
        statlog.gauge("screen_brightness_percent", msg.deviceState.screenBrightnessPercent)

        # report to server once every 10 minutes
        if (count % int(600. / DT_TRML)) == 0:
            cloudlog.event("STATUS_PACKET",
                           count=count,
                           pandaStates=[strip_deprecated_keys(p.to_dict()) for p in pandaStates],
                           peripheralState=strip_deprecated_keys(peripheralState.to_dict()),
                           location=(strip_deprecated_keys(sm["gpsLocationExternal"].to_dict()) if sm.alive["gpsLocationExternal"] else None),
                           deviceState=strip_deprecated_keys(msg.to_dict()))

        count += 1
  333. def main():
  334. hw_queue = queue.Queue(maxsize=1)
  335. end_event = threading.Event()
  336. threads = [
  337. threading.Thread(target=hw_state_thread, args=(end_event, hw_queue)),
  338. threading.Thread(target=thermald_thread, args=(end_event, hw_queue)),
  339. ]
  340. for t in threads:
  341. t.start()
  342. try:
  343. while True:
  344. time.sleep(1)
  345. if not all(t.is_alive() for t in threads):
  346. break
  347. finally:
  348. end_event.set()
  349. for t in threads:
  350. t.join()
  351. if __name__ == "__main__":
  352. main()