run_release_test.sh 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. #!/bin/bash
  2. # shellcheck disable=SC2317
  3. set -e
  4. if [ -n "$DEBUG" ]; then
  5. set -x
  6. fi
  7. cd "${0%/*}" || exit 1
  8. reason() {
  9. # Keep in sync with e2e.py ExitCode enum
  10. if [ "$1" -eq 0 ]; then
  11. REASON="success"
  12. elif [ "$1" -ge 1 ] && [ "$1" -lt 10 ]; then
  13. REASON="runtime error"
  14. elif [ "$1" -ge 10 ] && [ "$1" -lt 20 ]; then
  15. REASON="infra error"
  16. elif [ "$1" -ge 30 ] && [ "$1" -lt 40 ]; then
  17. REASON="infra timeout"
  18. elif [ "$1" -eq 42 ]; then
  19. REASON="command timeout"
  20. elif [ "$1" -ge 40 ] && [ "$1" -lt 50 ]; then
  21. REASON="command error"
  22. fi
  23. echo "${REASON}"
  24. }
  25. RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-"python ray_release/scripts/run_release_test.py"}
  26. RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
  27. BUILDKITE_MAX_RETRIES=1
  28. BUILDKITE_RETRY_CODE=79
  29. BUILDKITE_TIME_LIMIT_FOR_RETRY=10800 # 3 hours
  30. export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR BUILDKITE_MAX_RETRIES BUILDKITE_RETRY_CODE BUILDKITE_TIME_LIMIT_FOR_RETRY
  31. if [ -n "${RAY_COMMIT_OF_WHEEL-}" ]; then
  32. git config --global --add safe.directory /workdir
  33. HEAD_COMMIT=$(git rev-parse HEAD)
  34. echo "The test repo has head commit of ${HEAD_COMMIT}"
  35. if [[ "${HEAD_COMMIT}" != "${RAY_COMMIT_OF_WHEEL}" ]]; then
  36. echo "The checked out test code doesn't match with the installed wheel. \
  37. This is likely due to a racing condition when a PR is landed between \
  38. a wheel is installed and test code is checked out."
  39. echo "Hard resetting from ${HEAD_COMMIT} to ${RAY_COMMIT_OF_WHEEL}."
  40. git reset --hard "${RAY_COMMIT_OF_WHEEL}"
  41. fi
  42. fi
  43. if [ -z "${NO_INSTALL}" ]; then
  44. pip install -r ./requirements_buildkite.txt
  45. pip install --no-deps -e .
  46. fi
  47. RETRY_NUM=0
  48. MAX_RETRIES=${MAX_RETRIES-1}
  49. if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
  50. echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
  51. MAX_RETRIES=1
  52. fi
  53. ALL_EXIT_CODES=()
  54. while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
  55. RETRY_NUM=$((RETRY_NUM + 1))
  56. if [ "$RETRY_NUM" -gt 1 ]; then
  57. # Sleep for random time between 30 and 90 minutes
  58. SLEEP_TIME=$((1800 + RANDOM % 5400))
  59. if [ -n "${OVERRIDE_SLEEP_TIME}" ]; then
  60. SLEEP_TIME=${OVERRIDE_SLEEP_TIME}
  61. fi
  62. echo "----------------------------------------"
  63. echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
  64. echo "----------------------------------------"
  65. sleep "${SLEEP_TIME}"
  66. fi
  67. if [ -z "${NO_ARTIFACTS}" ]; then
  68. rm -rf "${RELEASE_RESULTS_DIR:?}"/* || true
  69. fi
  70. _term() {
  71. echo "[SCRIPT $(date +'%Y-%m-%d %H:%M:%S'),...] Caught SIGTERM signal, sending SIGTERM to release test script"
  72. kill "$proc"
  73. wait "$proc"
  74. }
  75. START=$(date +%s)
  76. set +e
  77. trap _term SIGINT SIGTERM
  78. ${RAY_TEST_SCRIPT} "$@" &
  79. proc=$!
  80. wait "$proc"
  81. EXIT_CODE=$?
  82. set -e
  83. END=$(date +%s)
  84. REASON=$(reason "${EXIT_CODE}")
  85. RUNTIME=$((END-START))
  86. ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
  87. case ${EXIT_CODE} in
  88. 0)
  89. echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
  90. break
  91. ;;
  92. 30 | 31 | 32 | 33)
  93. echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
  94. ;;
  95. *)
  96. echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
  97. break
  98. ;;
  99. esac
  100. done
  101. if [ -z "${NO_ARTIFACTS}" ]; then
  102. rm -rf /tmp/ray_release_test_artifacts/* || true
  103. cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
  104. fi
  105. echo "----------------------------------------"
  106. echo "Release test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
  107. echo "Run results:"
  108. COUNTER=1
  109. for EX in "${ALL_EXIT_CODES[@]}"; do
  110. REASON=$(reason "${EX}")
  111. echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
  112. COUNTER=$((COUNTER + 1))
  113. done
  114. echo "----------------------------------------"
  115. REASON=$(reason "${EXIT_CODE}")
  116. echo "Final release test exit code is ${EXIT_CODE} (${REASON}). Took ${RUNTIME}s"
  117. if [ "$EXIT_CODE" -eq 0 ]; then
  118. echo "RELEASE MANAGER: This test seems to have passed."
  119. elif [ "$EXIT_CODE" -ge 30 ] && [ "$EXIT_CODE" -lt 40 ]; then
  120. echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
  121. else
  122. echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
  123. fi
  124. if [[ "$EXIT_CODE" -ne 0 && "$RUNTIME" -le "$BUILDKITE_TIME_LIMIT_FOR_RETRY" ]]; then
  125. exit "$BUILDKITE_RETRY_CODE"
  126. else
  127. exit "$EXIT_CODE"
  128. fi