result.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. import enum
  2. import os
  3. from dataclasses import dataclass
  4. from typing import Optional, Dict, Tuple
  class ResultStatus(enum.Enum):
      """Overall status of a test run result.

      Every member's value is the lowercase string form of its name.
      """

      # Generic outcomes.
      SUCCESS = "success"
      UNKNOWN = "unknown"

      # Failure raised while running the test.
      RUNTIME_ERROR = "runtime_error"

      # Infrastructure-related failures.
      TRANSIENT_INFRA_ERROR = "transient_infra_error"
      INFRA_ERROR = "infra_error"
      INFRA_TIMEOUT = "infra_timeout"

      # Failures attributed to the test command itself.
      ERROR = "error"
      TIMEOUT = "timeout"
  @dataclass
  class Result:
      """Record of a single test run: its outcome plus related metadata.

      Every field has a default, so ``Result()`` can be created empty and
      filled in later.
      """

      # Outcome of the run.
      results: Optional[Dict] = None
      # String value of a ``ResultStatus`` member (not the enum member itself).
      status: str = ResultStatus.UNKNOWN.value
      return_code: int = 0
      last_logs: Optional[str] = None
      runtime: Optional[float] = None

      # Flags describing the kind of run.
      stable: bool = True
      smoke_test: bool = False

      # Links associated with the run.
      buildkite_url: Optional[str] = None
      wheels_url: Optional[str] = None
      cluster_url: Optional[str] = None

      # Anyscale Jobs specific
      job_url: Optional[str] = None
      job_id: Optional[str] = None

      # Identifiers and additional data attached to the run.
      buildkite_job_id: Optional[str] = None
      cluster_id: Optional[str] = None
      prometheus_metrics: Optional[Dict] = None
      extra_tags: Optional[Dict] = None
  class ExitCode(enum.Enum):
      """Numeric exit codes, grouped into ranges by failure category.

      NOTE: If you change these, also change the `retry` section
      in `build_pipeline.py` and the `reason()` function in `run_e2e.sh`.
      """

      # 0-9: generic codes.
      SUCCESS = 0  # Do not set/return this manually
      UNCAUGHT = 1  # Do not set/return this manually
      UNSPECIFIED = 2
      UNKNOWN = 3

      # 10-19: hard infra errors (non-retryable).
      CLI_ERROR = 10
      CONFIG_ERROR = 11
      SETUP_ERROR = 12
      CLUSTER_RESOURCE_ERROR = 13
      CLUSTER_ENV_BUILD_ERROR = 14
      CLUSTER_STARTUP_ERROR = 15
      LOCAL_ENV_SETUP_ERROR = 16
      REMOTE_ENV_SETUP_ERROR = 17
      FETCH_RESULT_ERROR = 18
      ANYSCALE_ERROR = 19

      # 30-39: infra timeouts (retryable).
      RAY_WHEELS_TIMEOUT = 30
      CLUSTER_ENV_BUILD_TIMEOUT = 31
      CLUSTER_STARTUP_TIMEOUT = 32
      CLUSTER_WAIT_TIMEOUT = 33

      # 40+: command errors - these are considered application errors.
      COMMAND_ERROR = 40
      COMMAND_ALERT = 41
      COMMAND_TIMEOUT = 42
      PREPARE_ERROR = 43
  def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool:
      """Classify whether an infra failure should be treated as transient.

      Only infra failures (``INFRA_ERROR`` or ``INFRA_TIMEOUT``) can be
      transient; any other status is never reclassified. For an infra failure
      the decision is based on how many retries have already happened and on
      how long the run took.

      Environment variables read:
          BUILDKITE_RETRY_COUNT: retries already performed (default 0).
          BUILDKITE_MAX_RETRIES: retry limit (default 1).
          BUILDKITE_TIME_LIMIT_FOR_RETRY: largest runtime still eligible for
              a retry (default 0).

      Args:
          result_status: Status already assigned to the failed run.
          runtime: Duration of the run, in the same unit as
              ``BUILDKITE_TIME_LIMIT_FOR_RETRY`` (presumably seconds - confirm
              against the caller).

      Returns:
          True if the failure is an infra failure, the retry limit has not
          been reached, and the runtime does not exceed the time limit;
          False otherwise.

      Raises:
          ValueError: If one of the environment variables is set to a value
              that ``int()`` cannot parse.
      """
      if result_status not in (ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT):
          # Not an infra failure at all, so it cannot be a transient one.
          return False

      retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0))
      max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1))
      if retry_count >= max_retry:
          # The retry limit has already been reached.
          return False

      time_limit = int(os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0))
      if runtime > time_limit:
          # The run took too long to be retried.
          return False

      return True
  def handle_exception(
      e: Exception, run_duration: int
  ) -> Tuple[ExitCode, ResultStatus, Optional[int]]:
      """Translate an exception from a test run into exit code, status and runtime.

      Exceptions that are not ``ReleaseTestError`` map to
      ``(ExitCode.UNKNOWN, ResultStatus.UNKNOWN, 0)``. For a
      ``ReleaseTestError`` the status is chosen from the numeric range of
      ``e.exit_code``:

      * 1-9   -> ``RUNTIME_ERROR``, runtime ``None``
      * 10-19 -> ``INFRA_ERROR``, runtime ``None``
      * 30-39 -> ``INFRA_TIMEOUT``, runtime ``None``
      * ``COMMAND_TIMEOUT`` -> ``TIMEOUT``, runtime ``0``
      * 40 and above -> ``ERROR``, runtime ``0``
      * anything else (0 or the unassigned 20-29 range) -> ``UNKNOWN``,
        runtime ``0``

      The status is then replaced by ``TRANSIENT_INFRA_ERROR`` when
      ``_is_transient_error`` reports that the failure is to be retried.

      Args:
          e: The exception raised by the test run.
          run_duration: How long the run took; forwarded to
              ``_is_transient_error``.

      Returns:
          A ``(exit_code, result_status, runtime)`` tuple.
      """
      # Imported lazily inside the function, as in the original code.
      from ray_release.exception import ReleaseTestError

      if not isinstance(e, ReleaseTestError):
          return ExitCode.UNKNOWN, ResultStatus.UNKNOWN, 0

      exit_code = e.exit_code
      code = exit_code.value

      if 1 <= code < 10:
          result_status = ResultStatus.RUNTIME_ERROR
          runtime = None
      elif 10 <= code < 20:
          result_status = ResultStatus.INFRA_ERROR
          runtime = None
      elif 30 <= code < 40:
          result_status = ResultStatus.INFRA_TIMEOUT
          runtime = None
      elif exit_code == ExitCode.COMMAND_TIMEOUT:
          # Must be tested before the generic ">= 40" branch below.
          result_status = ResultStatus.TIMEOUT
          runtime = 0
      elif 40 <= code:
          result_status = ResultStatus.ERROR
          runtime = 0
      else:
          # Codes outside every known range (0 or 20-29) used to leave
          # result_status/runtime unbound and crash with UnboundLocalError.
          # Report them as unknown, mirroring the non-ReleaseTestError path.
          result_status = ResultStatus.UNKNOWN
          runtime = 0

      # if this result is to be retried, mark its status as transient
      # this logic should be in-sync with run_release_test.sh
      if _is_transient_error(result_status, run_duration):
          result_status = ResultStatus.TRANSIENT_INFRA_ERROR

      return exit_code, result_status, runtime