ci_state_machine.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. from typing import List
  2. from ray_release.test_automation.state_machine import (
  3. TestStateMachine,
  4. WEEKLY_RELEASE_BLOCKER_TAG,
  5. )
  6. from ray_release.test import Test, TestState, TestResult
  7. CONTINUOUS_FAILURE_TO_FLAKY = 3 # Number of continuous failures before flaky
  8. CONTINUOUS_PASSING_TO_PASSING = 10 # Number of continuous passing before passing
  9. FLAKY_PERCENTAGE_THRESHOLD = 7 # Percentage threshold to be considered as flaky
  10. FAILING_TO_FLAKY_MESSAGE = (
  11. "This test is now considered as flaky because it has been "
  12. "failing on postmerge for too long. Flaky tests do not run on premerge."
  13. )
  14. JAILED_MESSAGE = (
  15. "This test is confirmed to be jailed because of the presence of the jailed tag. "
  16. "Jailed tests are no longer release blocking. To unjail this test, close this "
  17. "issue."
  18. )
  19. JAILED_TAG = "jailed-test"
  20. MAX_REPORT_FAILURE_NO = 5
  21. class CITestStateMachine(TestStateMachine):
  22. def __init__(self, test: Test, dry_run: bool = False) -> None:
  23. # Need long enough test history to detect flaky tests
  24. super().__init__(test, dry_run=dry_run, history_length=30)
  25. def _move_hook(self, from_state: TestState, to_state: TestState) -> None:
  26. change = (from_state, to_state)
  27. if change == (TestState.PASSING, TestState.CONSITENTLY_FAILING):
  28. self._create_github_issue()
  29. self._trigger_bisect()
  30. elif change == (TestState.FAILING, TestState.CONSITENTLY_FAILING):
  31. self._create_github_issue()
  32. self._trigger_bisect()
  33. elif change == (TestState.CONSITENTLY_FAILING, TestState.PASSING):
  34. self._close_github_issue()
  35. elif change == (TestState.CONSITENTLY_FAILING, TestState.FLAKY):
  36. self._comment_github_issue(FAILING_TO_FLAKY_MESSAGE)
  37. elif change == (TestState.PASSING, TestState.FLAKY):
  38. self._create_github_issue()
  39. elif change == (TestState.FLAKY, TestState.PASSING):
  40. self._close_github_issue()
  41. elif change == (TestState.FLAKY, TestState.JAILED):
  42. self._comment_github_issue(JAILED_MESSAGE)
  43. def _state_hook(self, _: TestState) -> None:
  44. pass
  45. def _comment_github_issue(self, comment: str) -> bool:
  46. github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
  47. if not github_issue_number:
  48. return False
  49. issue = self.ray_repo.get_issue(github_issue_number)
  50. issue.create_comment(comment)
  51. return True
  52. def _create_github_issue(self) -> None:
  53. labels = [
  54. "bug",
  55. "ci-test",
  56. "ray-test-bot",
  57. "flaky-tracker",
  58. "stability",
  59. "triage",
  60. self.test.get_oncall(),
  61. WEEKLY_RELEASE_BLOCKER_TAG,
  62. ]
  63. recent_failures = [
  64. result for result in self.test_results if result.is_failing()
  65. ][:MAX_REPORT_FAILURE_NO]
  66. body = (
  67. f"CI test **{self.test.get_name()}** is {self.test.get_state().value}. "
  68. "Recent failures: \n"
  69. )
  70. for failure in recent_failures:
  71. body += f"\t- {failure.url}\n"
  72. # This line is to match the regex in https://shorturl.at/aiK25
  73. body += f"\nDataCaseName-{self.test.get_name()}-END\n"
  74. body += "Managed by OSS Test Policy"
  75. title = f"CI test {self.test.get_name()} is {self.test.get_state().value}"
  76. # If the issue already exists, update the issue; otherwise creating a new one
  77. github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
  78. if not github_issue_number:
  79. issue_number = self.ray_repo.create_issue(
  80. title=title,
  81. body=body,
  82. labels=labels,
  83. ).number
  84. self.test[Test.KEY_GITHUB_ISSUE_NUMBER] = issue_number
  85. return
  86. else:
  87. issue = self.ray_repo.get_issue(github_issue_number)
  88. issue.edit(title=title, state="open")
  89. issue.create_comment(body)
  90. return
  91. def _consistently_failing_to_jailed(self) -> bool:
  92. return False
  93. def _passing_to_flaky(self) -> bool:
  94. # A test is flaky if it has been changing from passing to failing for
  95. # a certain percentage of time in the test history. However, if it has been
  96. # recently stable, then it is not flaky.
  97. if self._is_recently_stable():
  98. return False
  99. return self.is_flaky_result_history(self.test_results)
  100. @staticmethod
  101. def is_flaky_result_history(results: List[TestResult]):
  102. transition = 0
  103. for i in range(0, len(results) - 1):
  104. if results[i].is_failing() and results[i + 1].is_passing():
  105. transition += 1
  106. if transition >= FLAKY_PERCENTAGE_THRESHOLD * len(results) / 100:
  107. return True
  108. return False
  109. def _consistently_failing_to_flaky(self) -> bool:
  110. return len(self.test_results) >= CONTINUOUS_FAILURE_TO_FLAKY and all(
  111. result.is_failing()
  112. for result in self.test_results[:CONTINUOUS_FAILURE_TO_FLAKY]
  113. )
  114. def _flaky_to_passing(self) -> bool:
  115. # A flaky test is considered passing if it has been passing for a certain
  116. # period and the github issue is closed (by a human).
  117. return self._is_recently_stable()
  118. def _is_recently_stable(self) -> bool:
  119. return len(self.test_results) >= CONTINUOUS_PASSING_TO_PASSING and all(
  120. result.is_passing()
  121. for result in self.test_results[:CONTINUOUS_PASSING_TO_PASSING]
  122. )
  123. def _flaky_to_jailed(self) -> bool:
  124. # If a human has confirmed that this test is jailed (by adding the jailed tag),
  125. # then it is jailed
  126. github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
  127. if not github_issue_number:
  128. return False
  129. issue = self.ray_repo.get_issue(github_issue_number)
  130. return JAILED_TAG in [label.name for label in issue.get_labels()]