state_machine.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. import abc
  2. from typing import List
  3. from datetime import datetime, timedelta
  4. import github
  5. from github import Github
  6. from pybuildkite.buildkite import Buildkite
  7. from ray_release.test import (
  8. Test,
  9. TestState,
  10. )
  11. from ray_release.aws import get_secret_token
  12. from ray_release.logger import logger
  13. RAY_REPO = "ray-project/ray"
  14. BUILDKITE_ORGANIZATION = "ray-project"
  15. BUILDKITE_BISECT_PIPELINE = "release-tests-bisect"
  16. AWS_SECRET_GITHUB = "ray_ci_github_token"
  17. AWS_SECRET_BUILDKITE = "ray_ci_buildkite_token"
  18. WEEKLY_RELEASE_BLOCKER_TAG = "weekly-release-blocker"
  19. NO_TEAM = "none"
  20. TEAM = [
  21. "core",
  22. "data",
  23. "kuberay",
  24. "ml",
  25. "rllib",
  26. "serve",
  27. "serverless",
  28. ]
  29. MAX_BISECT_PER_DAY = 10 # Max number of bisects to run per day for all tests
  30. class TestStateMachine(abc.ABC):
  31. """
  32. State machine that computes the next state of a test based on the current state and
  33. performs actions accordingly during the state transition. For example:
  34. - passing -[two last results failed]-> failing: create github issue
  35. - failing -[last result passed]-> passing: close github issue
  36. - jailed -[latest result passed]-> passing: update the test's oncall
  37. ...
  38. """
  39. ray_repo = None
  40. ray_buildkite = None
  41. def __init__(
  42. self, test: Test, history_length: int = 10, dry_run: bool = False
  43. ) -> None:
  44. self.test = test
  45. self.test_results = test.get_test_results(limit=history_length)
  46. self.dry_run = dry_run
  47. TestStateMachine._init_ray_repo()
  48. TestStateMachine._init_ray_buildkite()
  49. @classmethod
  50. def _init_ray_repo(cls):
  51. if not cls.ray_repo:
  52. cls.ray_repo = cls.get_github().get_repo(RAY_REPO)
  53. @classmethod
  54. def get_github(cls):
  55. return Github(get_secret_token(AWS_SECRET_GITHUB))
  56. @classmethod
  57. def get_ray_repo(cls):
  58. cls._init_ray_repo()
  59. return cls.ray_repo
  60. @classmethod
  61. def _init_ray_buildkite(cls):
  62. if not cls.ray_buildkite:
  63. buildkite_token = get_secret_token(AWS_SECRET_BUILDKITE)
  64. cls.ray_buildkite = Buildkite()
  65. cls.ray_buildkite.set_access_token(buildkite_token)
  66. @classmethod
  67. def get_release_blockers(cls) -> List[github.Issue.Issue]:
  68. repo = cls.get_ray_repo()
  69. blocker_label = repo.get_label(WEEKLY_RELEASE_BLOCKER_TAG)
  70. return list(repo.get_issues(state="open", labels=[blocker_label]))
  71. @classmethod
  72. def get_issue_owner(cls, issue: github.Issue.Issue) -> str:
  73. labels = issue.get_labels()
  74. for label in labels:
  75. if label.name in TEAM:
  76. return label.name
  77. return NO_TEAM
  78. def move(self) -> None:
  79. """
  80. Move the test to the next state.
  81. """
  82. if not self.test_results:
  83. # No result to move the state
  84. return
  85. from_state = self.test.get_state()
  86. to_state = self._next_state(from_state)
  87. self.test.set_state(to_state)
  88. if self.dry_run:
  89. # Don't perform any action if dry run
  90. return
  91. self._move_hook(from_state, to_state)
  92. self._state_hook(to_state)
  93. def _next_state(self, current_state) -> TestState:
  94. """
  95. Compute the next state of the test based on the current state and the test
  96. """
  97. if current_state == TestState.PASSING:
  98. if self._passing_to_consistently_failing():
  99. return TestState.CONSITENTLY_FAILING
  100. if self._passing_to_failing():
  101. return TestState.FAILING
  102. if self._passing_to_flaky():
  103. return TestState.FLAKY
  104. if current_state == TestState.FAILING:
  105. if self._failing_to_consistently_failing():
  106. return TestState.CONSITENTLY_FAILING
  107. if self._failing_to_passing():
  108. return TestState.PASSING
  109. if current_state == TestState.CONSITENTLY_FAILING:
  110. if self._consistently_failing_to_jailed():
  111. return TestState.JAILED
  112. if self._consistently_failing_to_passing():
  113. return TestState.PASSING
  114. if self._consistently_failing_to_flaky():
  115. return TestState.FLAKY
  116. if current_state == TestState.FLAKY:
  117. if self._flaky_to_passing():
  118. return TestState.PASSING
  119. if self._flaky_to_jailed():
  120. return TestState.JAILED
  121. if current_state == TestState.JAILED:
  122. if self._jailed_to_passing():
  123. return TestState.PASSING
  124. return current_state
  125. def _jailed_to_passing(self) -> bool:
  126. return len(self.test_results) > 0 and self.test_results[0].is_passing()
  127. def _passing_to_failing(self) -> bool:
  128. return (
  129. len(self.test_results) > 0
  130. and self.test_results[0].is_failing()
  131. and not self._passing_to_consistently_failing()
  132. )
  133. def _passing_to_consistently_failing(self) -> bool:
  134. return (
  135. len(self.test_results) > 1
  136. and self.test_results[0].is_failing()
  137. and self.test_results[1].is_failing()
  138. )
  139. def _failing_to_passing(self) -> bool:
  140. return len(self.test_results) > 0 and self.test_results[0].is_passing()
  141. def _failing_to_consistently_failing(self) -> bool:
  142. return self._passing_to_consistently_failing() or self.test.get(
  143. Test.KEY_BISECT_BLAMED_COMMIT
  144. )
  145. def _consistently_failing_to_passing(self) -> bool:
  146. return self._failing_to_passing()
  147. """
  148. Abstract methods
  149. """
  150. @abc.abstractmethod
  151. def _move_hook(self, from_state: TestState, to_state: TestState) -> None:
  152. """
  153. Action performed when test transitions to a different state. This is where we do
  154. things like creating and closing github issues, trigger bisects, etc.
  155. """
  156. pass
  157. @abc.abstractmethod
  158. def _state_hook(self, state: TestState) -> None:
  159. """
  160. Action performed when test is in a particular state. This is where we do things
  161. to keep an invariant for a state. For example, we can keep the github issue open
  162. if the test is failing.
  163. """
  164. pass
  165. @abc.abstractmethod
  166. def _consistently_failing_to_jailed(self) -> bool:
  167. """
  168. Condition to jail a test. This is an abstract method since different state
  169. machine implements this logic differently.
  170. """
  171. pass
  172. @abc.abstractmethod
  173. def _passing_to_flaky(self) -> bool:
  174. pass
  175. @abc.abstractmethod
  176. def _consistently_failing_to_flaky(self) -> bool:
  177. pass
  178. @abc.abstractmethod
  179. def _flaky_to_passing(self) -> bool:
  180. pass
  181. @abc.abstractmethod
  182. def _flaky_to_jailed(self) -> bool:
  183. pass
  184. """
  185. Common helper methods
  186. """
  187. def _jail_test(self) -> None:
  188. """
  189. Notify github issue owner that the test is jailed
  190. """
  191. github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
  192. if not github_issue_number:
  193. return
  194. issue = self.ray_repo.get_issue(github_issue_number)
  195. issue.create_comment("Test has been failing for far too long. Jailing.")
  196. labels = ["jailed-test"] + [label.name for label in issue.get_labels()]
  197. issue.edit(labels=labels)
  198. def _close_github_issue(self) -> None:
  199. github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
  200. if not github_issue_number:
  201. return
  202. issue = self.ray_repo.get_issue(github_issue_number)
  203. issue.create_comment(f"Test passed on latest run: {self.test_results[0].url}")
  204. issue.edit(state="closed")
  205. def _keep_github_issue_open(self) -> None:
  206. github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
  207. if not github_issue_number:
  208. return
  209. issue = self.ray_repo.get_issue(github_issue_number)
  210. if issue.state == "open":
  211. return
  212. issue.edit(state="open")
  213. issue.create_comment(
  214. "Re-opening issue as test is still failing. "
  215. f"Latest run: {self.test_results[0].url}"
  216. )
  217. def comment_blamed_commit_on_github_issue(self) -> bool:
  218. """
  219. Comment the blamed commit on the github issue.
  220. Returns: True if the comment is made, False otherwise
  221. """
  222. blamed_commit = self.test.get(Test.KEY_BISECT_BLAMED_COMMIT)
  223. issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
  224. bisect_build_number = self.test.get(Test.KEY_BISECT_BUILD_NUMBER)
  225. if not issue_number or not bisect_build_number or not blamed_commit:
  226. logger.info(
  227. "Skip commenting blamed commit on github issue "
  228. f"for {self.test.get_name()}. The following fields should be set: "
  229. f" blamed_commit={blamed_commit}, issue_number={issue_number}, "
  230. f" bisect_build_number={bisect_build_number}"
  231. )
  232. return False
  233. issue = self.ray_repo.get_issue(issue_number)
  234. issue.create_comment(
  235. f"Blamed commit: {blamed_commit} "
  236. f"found by bisect job https://buildkite.com/{BUILDKITE_ORGANIZATION}/"
  237. f"{BUILDKITE_BISECT_PIPELINE}/builds/{bisect_build_number}"
  238. )
  239. return True
  240. def _trigger_bisect(self) -> None:
  241. if self._bisect_rate_limit_exceeded():
  242. logger.info(f"Skip bisect {self.test.get_name()} due to rate limit")
  243. return
  244. test_type = self.test.get_test_type().value
  245. build = self.ray_buildkite.builds().create_build(
  246. BUILDKITE_ORGANIZATION,
  247. BUILDKITE_BISECT_PIPELINE,
  248. "HEAD",
  249. "master",
  250. message=f"[ray-test-bot] {self.test.get_name()} failing",
  251. env={
  252. "UPDATE_TEST_STATE_MACHINE": "1",
  253. "RAYCI_TEST_TYPE": test_type,
  254. },
  255. )
  256. failing_commit = self.test_results[0].commit
  257. passing_commits = [r.commit for r in self.test_results if r.is_passing()]
  258. if not passing_commits:
  259. logger.info(f"Skip bisect {self.test.get_name()} due to no passing commit")
  260. return
  261. passing_commit = passing_commits[0]
  262. self.ray_buildkite.jobs().unblock_job(
  263. BUILDKITE_ORGANIZATION,
  264. BUILDKITE_BISECT_PIPELINE,
  265. build["number"],
  266. build["jobs"][0]["id"], # first job is the blocked job
  267. fields={
  268. "test-name": self.test.get_name(),
  269. "passing-commit": passing_commit,
  270. "failing-commit": failing_commit,
  271. "concurrency": "3",
  272. "run-per-commit": "1",
  273. "test-type": test_type,
  274. },
  275. )
  276. self.test[Test.KEY_BISECT_BUILD_NUMBER] = build["number"]
  277. def _bisect_rate_limit_exceeded(self) -> bool:
  278. """
  279. Check if we have exceeded the rate limit of bisects per day.
  280. """
  281. builds = self.ray_buildkite.builds().list_all_for_pipeline(
  282. BUILDKITE_ORGANIZATION,
  283. BUILDKITE_BISECT_PIPELINE,
  284. created_from=datetime.now() - timedelta(days=1),
  285. branch="master",
  286. )
  287. builds = [
  288. build
  289. for build in builds
  290. if build["env"].get("RAYCI_TEST_TYPE") == self.test.get_test_type().value
  291. ]
  292. return len(builds) >= self.test.get_bisect_daily_rate_limit()