run_release_test.sh 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. #!/bin/bash
  2. # shellcheck disable=SC2317
  3. set -e
  4. if [ -n "$DEBUG" ]; then
  5. set -x
  6. fi
  7. cd "${0%/*}" || exit 1
  8. reason() {
  9. # Keep in sync with e2e.py ExitCode enum
  10. if [ "$1" -eq 0 ]; then
  11. REASON="success"
  12. elif [ "$1" -ge 1 ] && [ "$1" -lt 10 ]; then
  13. REASON="runtime error"
  14. elif [ "$1" -ge 10 ] && [ "$1" -lt 20 ]; then
  15. REASON="infra error"
  16. elif [ "$1" -ge 30 ] && [ "$1" -lt 40 ]; then
  17. REASON="infra timeout"
  18. elif [ "$1" -eq 42 ]; then
  19. REASON="command timeout"
  20. elif [ "$1" -ge 40 ] && [ "$1" -lt 50 ]; then
  21. REASON="command error"
  22. fi
  23. echo "${REASON}"
  24. }
  25. RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}
  26. RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
  27. RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
  28. RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
  29. BUILDKITE_MAX_RETRIES=1
  30. BUILDKITE_RETRY_CODE=79
  31. BUILDKITE_TIME_LIMIT_FOR_RETRY=1800
  32. # This is not a great idea if your OS is different to the one
  33. # used in the product clusters. However, we need this in CI as reloading
  34. # Ray within the python process does not work for protobuf changes.
  35. INSTALL_MATCHING_RAY=${BUILDKITE-false}
  36. export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR BUILDKITE_MAX_RETRIES BUILDKITE_RETRY_CODE BUILDKITE_TIME_LIMIT_FOR_RETRY
  37. if [ -z "${NO_INSTALL}" ]; then
  38. pip install --use-deprecated=legacy-resolver -q -r requirements.txt
  39. pip install -q -U boto3 botocore bazel-runfiles
  40. if [ "${INSTALL_MATCHING_RAY-false}" == "true" ]; then
  41. # Find ray-wheels parameter and install locally
  42. i=1
  43. for arg in "$@"; do
  44. j=$((i+1))
  45. if [ "$arg" == "--ray-wheels" ]; then
  46. PARSED_RAY_WHEELS="${!j}"
  47. fi
  48. i=$j
  49. done
  50. if [ -n "${PARSED_RAY_WHEELS}" ]; then
  51. echo "Installing Ray wheels locally: ${PARSED_RAY_WHEELS}"
  52. pip install -U --force-reinstall "${PARSED_RAY_WHEELS}"
  53. else
  54. echo "Warning: No Ray wheels found to install locally"
  55. fi
  56. fi
  57. fi
  58. if [ -z "${NO_CLONE}" ]; then
  59. TMPDIR=$(mktemp -d -t release-XXXXXXXXXX)
  60. echo "Cloning test repo ${RAY_TEST_REPO} branch ${RAY_TEST_BRANCH}"
  61. git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" "${TMPDIR}"
  62. pushd "${TMPDIR}/release" || true
  63. HEAD_COMMIT=$(git rev-parse HEAD)
  64. echo "The cloned test repo has head commit of ${HEAD_COMMIT}"
  65. # We only do this if RAY_TEST_REPO and RAY_TEST_BRANCH are pointing to ray master.
  66. # Theoretically, release manager may also run into this issue when manually triggering
  67. # release test runs. But cherry-picks are rare and thus it's less likely to run into
  68. # this racing condition, ignoring for now.
  69. if [ "${RAY_TEST_REPO}" == "https://github.com/ray-project/ray.git" ] && \
  70. [[ "${PARSED_RAY_WHEELS}" == *"master"* ]] && \
  71. [ "${RAY_TEST_BRANCH-}" == "master" ] && [ -n "${RAY_COMMIT_OF_WHEEL-}" ] && \
  72. [ "${HEAD_COMMIT}" != "${RAY_COMMIT_OF_WHEEL}" ]; then
  73. echo "The checked out test code doesn't match with the installed wheel. \
  74. This is likely due to a racing condition when a PR is landed between \
  75. a wheel is installed and test code is checked out."
  76. echo "Hard resetting from ${HEAD_COMMIT} to ${RAY_COMMIT_OF_WHEEL}."
  77. git reset --hard "${RAY_COMMIT_OF_WHEEL}"
  78. fi
  79. fi
  80. if [ -z "${NO_INSTALL}" ]; then
  81. pip install --use-deprecated=legacy-resolver -c requirements.txt -e .
  82. fi
  83. RETRY_NUM=0
  84. MAX_RETRIES=${MAX_RETRIES-1}
  85. if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
  86. echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
  87. MAX_RETRIES=1
  88. fi
  89. ALL_EXIT_CODES=()
  90. while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
  91. RETRY_NUM=$((RETRY_NUM + 1))
  92. if [ "$RETRY_NUM" -gt 1 ]; then
  93. # Sleep for random time between 30 and 90 minutes
  94. SLEEP_TIME=$((1800 + RANDOM % 5400))
  95. if [ -n "${OVERRIDE_SLEEP_TIME}" ]; then
  96. SLEEP_TIME=${OVERRIDE_SLEEP_TIME}
  97. fi
  98. echo "----------------------------------------"
  99. echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
  100. echo "----------------------------------------"
  101. sleep "${SLEEP_TIME}"
  102. fi
  103. if [ -z "${NO_ARTIFACTS}" ]; then
  104. sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true
  105. fi
  106. _term() {
  107. echo "[SCRIPT $(date +'%Y-%m-%d %H:%M:%S'),...] Caught SIGTERM signal, sending SIGTERM to release test script"
  108. kill "$proc"
  109. wait "$proc"
  110. }
  111. START=$(date +%s)
  112. set +e
  113. trap _term SIGINT SIGTERM
  114. python "${RAY_TEST_SCRIPT}" "$@" &
  115. proc=$!
  116. wait "$proc"
  117. EXIT_CODE=$?
  118. set -e
  119. END=$(date +%s)
  120. REASON=$(reason "${EXIT_CODE}")
  121. RUNTIME=$((END-START))
  122. ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
  123. case ${EXIT_CODE} in
  124. 0)
  125. echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
  126. break
  127. ;;
  128. 30 | 31 | 32 | 33)
  129. echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
  130. ;;
  131. *)
  132. echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
  133. break
  134. ;;
  135. esac
  136. done
  137. if [ -z "${NO_ARTIFACTS}" ]; then
  138. sudo rm -rf /tmp/ray_release_test_artifacts/* || true
  139. sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
  140. fi
  141. echo "----------------------------------------"
  142. echo "Release test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
  143. echo "Run results:"
  144. COUNTER=1
  145. for EX in "${ALL_EXIT_CODES[@]}"; do
  146. REASON=$(reason "${EX}")
  147. echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
  148. COUNTER=$((COUNTER + 1))
  149. done
  150. echo "----------------------------------------"
  151. REASON=$(reason "${EXIT_CODE}")
  152. echo "Final release test exit code is ${EXIT_CODE} (${REASON}). Took ${RUNTIME}s"
  153. if [ "$EXIT_CODE" -eq 0 ]; then
  154. echo "RELEASE MANAGER: This test seems to have passed."
  155. elif [ "$EXIT_CODE" -ge 30 ] && [ "$EXIT_CODE" -lt 40 ]; then
  156. echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
  157. else
  158. echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
  159. fi
  160. if [ -z "${NO_CLONE}" ]; then
  161. popd || true
  162. rm -rf "${TMPDIR}" || true
  163. fi
  164. if [[ ("$REASON" == "infra error" || "$REASON" == "infra timeout") && ("$RUNTIME" -le "$BUILDKITE_TIME_LIMIT_FOR_RETRY") ]]; then
  165. exit "$BUILDKITE_RETRY_CODE"
  166. else
  167. exit "$EXIT_CODE"
  168. fi