tombstoned.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. #!/usr/bin/env python3
  2. import datetime
  3. import os
  4. import re
  5. import shutil
  6. import signal
  7. import subprocess
  8. import time
  9. import glob
  10. from typing import NoReturn
  11. from common.file_helpers import mkdirs_exists_ok
  12. from selfdrive.loggerd.config import ROOT
  13. import selfdrive.sentry as sentry
  14. from system.swaglog import cloudlog
  15. from system.version import get_commit
# Crash files larger than this are logged and skipped by report_tombstone_apport().
MAX_SIZE = 1_000_000 * 100  # allow up to 100M
# Upper bound applied to the file name of the copied crash log.
MAX_TOMBSTONE_FN_LEN = 62  # 85 - 23 ("<dongle id>/crash/")
# Folders scanned by get_tombstones(); APPORT_DIR is also emptied by clear_apport_folder().
TOMBSTONE_DIR = "/data/tombstones/"
APPORT_DIR = "/var/crash/"
  20. def safe_fn(s):
  21. extra = ['_']
  22. return "".join(c for c in s if c.isalnum() or c in extra).rstrip()
  23. def clear_apport_folder():
  24. for f in glob.glob(APPORT_DIR + '*'):
  25. try:
  26. os.remove(f)
  27. except Exception:
  28. pass
  29. def get_apport_stacktrace(fn):
  30. try:
  31. cmd = f'apport-retrace -s <(cat <(echo "Package: openpilot") "{fn}")'
  32. return subprocess.check_output(cmd, shell=True, encoding='utf8', timeout=30, executable='/bin/bash') # pylint: disable=unexpected-keyword-arg
  33. except subprocess.CalledProcessError:
  34. return "Error getting stacktrace"
  35. except subprocess.TimeoutExpired:
  36. return "Timeout getting stacktrace"
  37. def get_tombstones():
  38. """Returns list of (filename, ctime) for all tombstones in /data/tombstones
  39. and apport crashlogs in /var/crash"""
  40. files = []
  41. for folder in [TOMBSTONE_DIR, APPORT_DIR]:
  42. if os.path.exists(folder):
  43. with os.scandir(folder) as d:
  44. # Loop over first 1000 directory entries
  45. for _, f in zip(range(1000), d):
  46. if f.name.startswith("tombstone"):
  47. files.append((f.path, int(f.stat().st_ctime)))
  48. elif f.name.endswith(".crash") and f.stat().st_mode == 0o100640:
  49. files.append((f.path, int(f.stat().st_ctime)))
  50. return files
def report_tombstone_apport(fn) -> None:
    """Report the apport crash file *fn* to sentry and move it to the crash log folder.

    Files larger than MAX_SIZE are logged and skipped. Otherwise the report is
    read up to its "CoreDump" line, a stack trace is requested through
    get_apport_stacktrace(), a one-line message and the collected contents are
    passed to sentry.report_tombstone(), and the file is copied into
    ``os.path.join(ROOT, "crash")`` before the original is removed.
    """
    # NOTE(review): the listing this was taken from had no indentation; the
    # nesting below is reconstructed from the statement order — confirm it
    # against the original file.
    f_size = os.path.getsize(fn)
    if f_size > MAX_SIZE:
        cloudlog.error(f"Tombstone {fn} too big, {f_size}. Skipping...")
        return

    message = ""  # One line description of the crash
    contents = ""  # Full file contents without coredump
    path = ""  # File path relative to openpilot directory

    # True while reading the lines from a "ProcMaps" line up to (not including)
    # the next "ProcStatus" line; those lines are left out of `contents`.
    proc_maps = False

    with open(fn) as f:
        for line in f:
            if "CoreDump" in line:
                # Nothing from the core dump onwards is read.
                break
            elif "ProcMaps" in line:
                proc_maps = True
            elif "ProcStatus" in line:
                proc_maps = False

            if not proc_maps:
                contents += line

            if "ExecutablePath" in line:
                # Take the text after the last ': ' and drop the openpilot prefix.
                path = line.strip().split(': ')[-1]
                path = path.replace('/data/openpilot/', '')
                message += path
            elif "Signal" in line:
                message += " - " + line.strip()

                # Append the symbolic signal name when the value parses as a
                # known signal number; otherwise keep the raw line only.
                try:
                    sig_num = int(line.strip().split(': ')[-1])
                    message += " (" + signal.Signals(sig_num).name + ")"  # pylint: disable=no-member
                except ValueError:
                    pass

    stacktrace = get_apport_stacktrace(fn)
    stacktrace_s = stacktrace.split('\n')
    crash_function = "No stacktrace"

    # Only outputs with more than two lines are searched for a crash location.
    if len(stacktrace_s) > 2:
        found = False

        # Try to find first entry in openpilot, fall back to the line at index 1
        for line in stacktrace_s:
            if "at selfdrive/" in line:
                crash_function = line
                found = True
                break

        if not found:
            crash_function = stacktrace_s[1]

        # Remove arguments that can contain pointers to make sentry one-liner unique:
        # drop the first space-separated token and every token starting with '0x',
        # then strip parenthesised groups.
        crash_function = " ".join(x for x in crash_function.split(' ')[1:] if not x.startswith('0x'))
        crash_function = re.sub(r'\(.*?\)', '', crash_function)

    contents = stacktrace + "\n\n" + contents
    message = message + " - " + crash_function
    sentry.report_tombstone(fn, message, contents)

    # Copy crashlog to upload folder
    clean_path = path.replace('/', '_')
    date = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")

    # <date>_<first 8 chars of commit>_<sanitised executable path>, truncated to MAX_TOMBSTONE_FN_LEN
    new_fn = f"{date}_{get_commit(default='nocommit')[:8]}_{safe_fn(clean_path)}"[:MAX_TOMBSTONE_FN_LEN]

    crashlog_dir = os.path.join(ROOT, "crash")
    mkdirs_exists_ok(crashlog_dir)

    # Files could be on different filesystems, copy, then delete
    shutil.copy(fn, os.path.join(crashlog_dir, new_fn))

    # The original is left in place when it cannot be removed for lack of permission.
    try:
        os.remove(fn)
    except PermissionError:
        pass
  112. def main() -> NoReturn:
  113. sentry.init(sentry.SentryProject.SELFDRIVE_NATIVE)
  114. # Clear apport folder on start, otherwise duplicate crashes won't register
  115. clear_apport_folder()
  116. initial_tombstones = set(get_tombstones())
  117. while True:
  118. now_tombstones = set(get_tombstones())
  119. for fn, _ in (now_tombstones - initial_tombstones):
  120. try:
  121. cloudlog.info(f"reporting new tombstone {fn}")
  122. if fn.endswith(".crash"):
  123. report_tombstone_apport(fn)
  124. else:
  125. cloudlog.error(f"unknown crash type: {fn}")
  126. except Exception:
  127. cloudlog.exception(f"Error reporting tombstone {fn}")
  128. initial_tombstones = now_tombstones
  129. time.sleep(5)
# Start the polling loop only when this file is executed as a script.
if __name__ == "__main__":
    main()