log_aggregator.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. import re
  2. from typing import List
  3. TRACEBACK_PATTERN = "Traceback (most recent call last)"
  4. class LogAggregator:
  5. def __init__(self, log: str):
  6. self.log = log
  7. def compute_crash_pattern(self) -> str:
  8. stack_trace = LogAggregator._compute_stack_trace(self.log.splitlines())
  9. # truncate short enough to store in databases, but long enough to keep the
  10. # pattern unique
  11. return LogAggregator._compute_signature(stack_trace)[:4000]
  12. @staticmethod
  13. def _compute_signature(stack_trace: List[str]) -> str:
  14. """
  15. Compute signature pattern from stack trace, by remove factors such as date,
  16. time, temp directory, line numbers, etc. This help to aggregate similar logs
  17. into same bug patterns
  18. """
  19. massaged_trace = []
  20. for line in stack_trace:
  21. # remove any hashes that are more than 10 characters
  22. line = re.sub(r"[a-z0-9]{10,}", "", line.strip())
  23. # remove any numbers
  24. line = re.sub(r"\d", "", line)
  25. if line == "Traceback (most recent call last):":
  26. continue
  27. file_line = re.search(r'File "(.*)", (.*)', line)
  28. if file_line:
  29. # append the file's base name and caller information; the result string
  30. # is not something meaningful to human, we just need something that
  31. # uniquely represent the stack trace
  32. line = f'{file_line.group(1).split("/")[-1]}{file_line.group(2)}'
  33. massaged_trace.append(line)
  34. return "".join(massaged_trace)
  35. @staticmethod
  36. def _compute_stack_trace(logs: List[str]) -> List[str]:
  37. """
  38. Extract stack trace pattern from the logs. Stack trace pattern often matches
  39. the following:
  40. ERROR ...
  41. Traceback (most recent call last):
  42. File "...", line ..., in ...
  43. ...
  44. Exception: exception error
  45. """
  46. error_stacktrace = []
  47. stacktrace = []
  48. i = 0
  49. while i < len(logs):
  50. stack = []
  51. trace = error_stacktrace
  52. # Search for lines that are either
  53. # ... ERROR ...
  54. # or
  55. # ... ERROR ...
  56. # Traceback (most recent call last):
  57. if "ERROR" in logs[i]:
  58. stack.append(logs[i])
  59. next = i + 1
  60. if i + 1 < len(logs) and TRACEBACK_PATTERN in logs[i + 1]:
  61. stack.append(logs[i + 1])
  62. next = i + 2
  63. # Or if the line with ERROR does not exist, just search for the line with
  64. # Traceback (most recent call last):
  65. elif TRACEBACK_PATTERN in logs[i]:
  66. stack.append(logs[i])
  67. trace = stacktrace
  68. next = i + 1
  69. # Or else, skip this line and continue
  70. else:
  71. i = i + 1
  72. continue
  73. # If the line that contains ERROR, Traceback, etc. is found, scan the logs
  74. # until the line no longer has indentation. This is because stack trace
  75. # is always indented, and stops when the line is no longer indented
  76. while next < len(logs):
  77. if logs[next].startswith((" ", "\t")):
  78. stack.append(logs[next])
  79. next = next + 1
  80. else:
  81. break
  82. # Finished capturing the entire stack trace
  83. if next < len(logs):
  84. stack.append(logs[next])
  85. if stack:
  86. trace.append(stack)
  87. i = next + 1
  88. # Favor stack trace that contains the ERROR keyword
  89. if error_stacktrace:
  90. return error_stacktrace[-1]
  91. # Otherwise any stack trace is fine
  92. if stacktrace:
  93. return stacktrace[-1]
  94. return []