123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335 |
- import abc
- from typing import List
- from datetime import datetime, timedelta
- import github
- from github import Github
- from pybuildkite.buildkite import Buildkite
- from ray_release.test import (
- Test,
- TestState,
- )
- from ray_release.aws import get_secret_token
- from ray_release.logger import logger
# GitHub repository whose issues the state machine reads and updates
RAY_REPO = "ray-project/ray"

# Buildkite organization and pipeline used to launch bisect builds
BUILDKITE_ORGANIZATION = "ray-project"
BUILDKITE_BISECT_PIPELINE = "release-tests-bisect"

# Secret names handed to get_secret_token() to obtain the API tokens
AWS_SECRET_GITHUB = "ray_ci_github_token"
AWS_SECRET_BUILDKITE = "ray_ci_buildkite_token"

# GitHub label used to look up open release-blocker issues
WEEKLY_RELEASE_BLOCKER_TAG = "weekly-release-blocker"

# Owner reported for an issue that carries none of the TEAM labels
NO_TEAM = "none"
# GitHub label names that identify the team owning an issue
TEAM = ["core", "data", "kuberay", "ml", "rllib", "serve", "serverless"]

MAX_BISECT_PER_DAY = 10  # Max number of bisects to run per day for all tests
class TestStateMachine(abc.ABC):
    """
    State machine that computes the next state of a test based on its current state
    and its most recent results, and performs actions during the transition.

    The transition rules live in `_next_state`. The actions themselves are supplied
    by subclasses through `_move_hook` and `_state_hook`. Examples of the kind of
    action a subclass may attach to a transition:
    - passing -[latest result failed]-> failing: create github issue
    - failing -[latest result passed]-> passing: close github issue
    - jailed -[latest result passed]-> passing: update the test's oncall
    ...
    """

    # GitHub repository handle for RAY_REPO; created lazily and cached on the class
    ray_repo = None
    # Buildkite client; created lazily and cached on the class
    ray_buildkite = None

    def __init__(
        self, test: Test, history_length: int = 10, dry_run: bool = False
    ) -> None:
        """
        Args:
            test: The test whose state is managed.
            history_length: Limit passed to the test when fetching its results.
            dry_run: If True, `move` still records the new state on the test but
                skips the move/state hooks.
        """
        self.test = test
        # Index 0 is treated as the latest result throughout this class
        self.test_results = test.get_test_results(limit=history_length)
        self.dry_run = dry_run
        TestStateMachine._init_ray_repo()
        TestStateMachine._init_ray_buildkite()

    @classmethod
    def _init_ray_repo(cls) -> None:
        """Create the GitHub repository handle if it has not been created yet."""
        if not cls.ray_repo:
            cls.ray_repo = cls.get_github().get_repo(RAY_REPO)

    @classmethod
    def get_github(cls):
        """Return a new GitHub client authenticated with the CI token."""
        return Github(get_secret_token(AWS_SECRET_GITHUB))

    @classmethod
    def get_ray_repo(cls):
        """Return the cached GitHub repository handle, creating it on first use."""
        cls._init_ray_repo()
        return cls.ray_repo

    @classmethod
    def _init_ray_buildkite(cls) -> None:
        """Create the Buildkite client if it has not been created yet."""
        if not cls.ray_buildkite:
            buildkite_token = get_secret_token(AWS_SECRET_BUILDKITE)
            cls.ray_buildkite = Buildkite()
            cls.ray_buildkite.set_access_token(buildkite_token)

    @classmethod
    def get_release_blockers(cls) -> List[github.Issue.Issue]:
        """Return the open issues labeled with WEEKLY_RELEASE_BLOCKER_TAG."""
        repo = cls.get_ray_repo()
        blocker_label = repo.get_label(WEEKLY_RELEASE_BLOCKER_TAG)
        return list(repo.get_issues(state="open", labels=[blocker_label]))

    @classmethod
    def get_issue_owner(cls, issue: github.Issue.Issue) -> str:
        """
        Return the first label of the issue that names a team in TEAM, or NO_TEAM
        if the issue carries no such label.
        """
        for label in issue.get_labels():
            if label.name in TEAM:
                return label.name
        return NO_TEAM

    def move(self) -> None:
        """
        Move the test to the next state.

        Does nothing when there are no test results. Otherwise the computed state
        is always stored on the test; the hooks run only when not in dry-run mode.
        """
        if not self.test_results:
            # No result to move the state
            return
        from_state = self.test.get_state()
        to_state = self._next_state(from_state)
        self.test.set_state(to_state)
        if self.dry_run:
            # Don't perform any action if dry run
            return
        self._move_hook(from_state, to_state)
        self._state_hook(to_state)

    def _next_state(self, current_state: TestState) -> TestState:
        """
        Compute the next state of the test based on the current state and the test
        results.

        For each state the candidate transitions are tried in a fixed order and the
        first one whose condition holds wins. If none holds, the current state is
        returned unchanged.
        """
        if current_state == TestState.PASSING:
            if self._passing_to_consistently_failing():
                return TestState.CONSITENTLY_FAILING
            if self._passing_to_failing():
                return TestState.FAILING
            if self._passing_to_flaky():
                return TestState.FLAKY
        if current_state == TestState.FAILING:
            if self._failing_to_consistently_failing():
                return TestState.CONSITENTLY_FAILING
            if self._failing_to_passing():
                return TestState.PASSING
        if current_state == TestState.CONSITENTLY_FAILING:
            if self._consistently_failing_to_jailed():
                return TestState.JAILED
            if self._consistently_failing_to_passing():
                return TestState.PASSING
            if self._consistently_failing_to_flaky():
                return TestState.FLAKY
        if current_state == TestState.FLAKY:
            if self._flaky_to_passing():
                return TestState.PASSING
            if self._flaky_to_jailed():
                return TestState.JAILED
        if current_state == TestState.JAILED:
            if self._jailed_to_passing():
                return TestState.PASSING
        return current_state

    def _jailed_to_passing(self) -> bool:
        """True if the latest result is passing."""
        return len(self.test_results) > 0 and self.test_results[0].is_passing()

    def _passing_to_failing(self) -> bool:
        """True if the latest result failed but the one before it did not."""
        return (
            len(self.test_results) > 0
            and self.test_results[0].is_failing()
            and not self._passing_to_consistently_failing()
        )

    def _passing_to_consistently_failing(self) -> bool:
        """True if the two latest results are both failing."""
        return (
            len(self.test_results) > 1
            and self.test_results[0].is_failing()
            and self.test_results[1].is_failing()
        )

    def _failing_to_passing(self) -> bool:
        """True if the latest result is passing."""
        return len(self.test_results) > 0 and self.test_results[0].is_passing()

    def _failing_to_consistently_failing(self) -> bool:
        """
        True if the two latest results are failing, or if a blamed commit has been
        recorded on the test.
        """
        # bool() so the declared return type holds even when the stored blamed
        # commit is a non-boolean value or missing
        return self._passing_to_consistently_failing() or bool(
            self.test.get(Test.KEY_BISECT_BLAMED_COMMIT)
        )

    def _consistently_failing_to_passing(self) -> bool:
        """True if the latest result is passing."""
        return self._failing_to_passing()

    # ------------------------------------------------------------------
    # Abstract methods
    # ------------------------------------------------------------------

    @abc.abstractmethod
    def _move_hook(self, from_state: TestState, to_state: TestState) -> None:
        """
        Action performed when test transitions to a different state. This is where we
        do things like creating and closing github issues, trigger bisects, etc.
        """

    @abc.abstractmethod
    def _state_hook(self, state: TestState) -> None:
        """
        Action performed when test is in a particular state. This is where we do
        things to keep an invariant for a state. For example, we can keep the github
        issue open if the test is failing.
        """

    @abc.abstractmethod
    def _consistently_failing_to_jailed(self) -> bool:
        """
        Condition to jail a test. This is an abstract method since different state
        machine implements this logic differently.
        """

    @abc.abstractmethod
    def _passing_to_flaky(self) -> bool:
        """Condition to move a passing test to the flaky state."""

    @abc.abstractmethod
    def _consistently_failing_to_flaky(self) -> bool:
        """Condition to move a consistently failing test to the flaky state."""

    @abc.abstractmethod
    def _flaky_to_passing(self) -> bool:
        """Condition to move a flaky test to the passing state."""

    @abc.abstractmethod
    def _flaky_to_jailed(self) -> bool:
        """Condition to move a flaky test to the jailed state."""

    # ------------------------------------------------------------------
    # Common helper methods
    # ------------------------------------------------------------------

    def _jail_test(self) -> None:
        """
        Notify github issue owner that the test is jailed.

        Comments on the test's github issue and adds the "jailed-test" label. Does
        nothing if the test has no github issue number recorded.
        """
        github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
        if not github_issue_number:
            return
        issue = self.ray_repo.get_issue(github_issue_number)
        issue.create_comment("Test has been failing for far too long. Jailing.")
        existing_labels = [label.name for label in issue.get_labels()]
        # Avoid sending the label twice when the issue is already marked as jailed
        labels = ["jailed-test"] + [
            name for name in existing_labels if name != "jailed-test"
        ]
        issue.edit(labels=labels)

    def _close_github_issue(self) -> None:
        """
        Comment the latest run's url on the test's github issue and close it. Does
        nothing if the test has no github issue number recorded.
        """
        github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
        if not github_issue_number:
            return
        issue = self.ray_repo.get_issue(github_issue_number)
        issue.create_comment(f"Test passed on latest run: {self.test_results[0].url}")
        issue.edit(state="closed")

    def _keep_github_issue_open(self) -> None:
        """
        Re-open the test's github issue, with a comment, if it is not open. Does
        nothing if the test has no github issue number or the issue is already open.
        """
        github_issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
        if not github_issue_number:
            return
        issue = self.ray_repo.get_issue(github_issue_number)
        if issue.state == "open":
            return
        issue.edit(state="open")
        issue.create_comment(
            "Re-opening issue as test is still failing. "
            f"Latest run: {self.test_results[0].url}"
        )

    def comment_blamed_commit_on_github_issue(self) -> bool:
        """
        Comment the blamed commit on the github issue.

        The comment is only made when the blamed commit, the github issue number and
        the bisect build number are all set on the test.

        Returns: True if the comment is made, False otherwise
        """
        blamed_commit = self.test.get(Test.KEY_BISECT_BLAMED_COMMIT)
        issue_number = self.test.get(Test.KEY_GITHUB_ISSUE_NUMBER)
        bisect_build_number = self.test.get(Test.KEY_BISECT_BUILD_NUMBER)
        if not issue_number or not bisect_build_number or not blamed_commit:
            logger.info(
                "Skip commenting blamed commit on github issue "
                f"for {self.test.get_name()}. The following fields should be set: "
                f" blamed_commit={blamed_commit}, issue_number={issue_number}, "
                f" bisect_build_number={bisect_build_number}"
            )
            return False
        issue = self.ray_repo.get_issue(issue_number)
        issue.create_comment(
            f"Blamed commit: {blamed_commit} "
            f"found by bisect job https://buildkite.com/{BUILDKITE_ORGANIZATION}/"
            f"{BUILDKITE_BISECT_PIPELINE}/builds/{bisect_build_number}"
        )
        return True

    def _trigger_bisect(self) -> None:
        """
        Start a bisect build for the test on the bisect pipeline.

        The bisect runs between the most recent passing commit in the fetched
        results and the commit of the latest result. Nothing is started when the
        daily rate limit is reached or when no fetched result is passing. On success
        the build number is recorded on the test.
        """
        if self._bisect_rate_limit_exceeded():
            logger.info(f"Skip bisect {self.test.get_name()} due to rate limit")
            return
        # Check for a passing commit before creating the build, so that skipping
        # the bisect never leaves a blocked build behind in the pipeline
        passing_commits = [r.commit for r in self.test_results if r.is_passing()]
        if not passing_commits:
            logger.info(f"Skip bisect {self.test.get_name()} due to no passing commit")
            return
        passing_commit = passing_commits[0]
        failing_commit = self.test_results[0].commit
        test_type = self.test.get_test_type().value
        build = self.ray_buildkite.builds().create_build(
            BUILDKITE_ORGANIZATION,
            BUILDKITE_BISECT_PIPELINE,
            "HEAD",
            "master",
            message=f"[ray-test-bot] {self.test.get_name()} failing",
            env={
                "UPDATE_TEST_STATE_MACHINE": "1",
                "RAYCI_TEST_TYPE": test_type,
            },
        )
        self.ray_buildkite.jobs().unblock_job(
            BUILDKITE_ORGANIZATION,
            BUILDKITE_BISECT_PIPELINE,
            build["number"],
            build["jobs"][0]["id"],  # first job is the blocked job
            fields={
                "test-name": self.test.get_name(),
                "passing-commit": passing_commit,
                "failing-commit": failing_commit,
                "concurrency": "3",
                "run-per-commit": "1",
                "test-type": test_type,
            },
        )
        self.test[Test.KEY_BISECT_BUILD_NUMBER] = build["number"]

    def _bisect_rate_limit_exceeded(self) -> bool:
        """
        Check if we have exceeded the rate limit of bisects per day.

        Counts the bisect-pipeline builds on master created within the last day
        whose RAYCI_TEST_TYPE matches this test's type, and compares the count with
        the test's own daily limit.
        """
        # NOTE(review): datetime.now() is a naive local timestamp; confirm how the
        # Buildkite client interprets it when the host is not running in UTC
        builds = self.ray_buildkite.builds().list_all_for_pipeline(
            BUILDKITE_ORGANIZATION,
            BUILDKITE_BISECT_PIPELINE,
            created_from=datetime.now() - timedelta(days=1),
            branch="master",
        )
        test_type = self.test.get_test_type().value
        builds = [
            build for build in builds if build["env"].get("RAYCI_TEST_TYPE") == test_type
        ]
        return len(builds) >= self.test.get_bisect_daily_rate_limit()
|