run_release_test.sh 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  #!/bin/bash
  # Entry point for running a release test: sets shell options and switches to
  # the directory that contains this script before anything else runs.

  # Abort on the first unhandled command failure.
  set -e

  # Setting DEBUG to any non-empty value traces every executed command.
  if [ -n "${DEBUG:-}" ]; then
    set -x
  fi

  # Run from the directory that contains this script. `dirname` yields "." when
  # $0 has no slash (e.g. `bash run_release_test.sh`); the "${0%/*}" expansion
  # would leave the script name unchanged in that case and the cd would fail.
  cd "$(dirname -- "$0")" || exit 1
  #######################################
  # Translate a release test exit code into a human-readable reason.
  # Keep in sync with e2e.py ExitCode enum.
  # Arguments: $1 - exit code (non-negative integer)
  # Outputs:   the reason on stdout; "unknown error" when the code is not an
  #            integer or falls outside every known range
  # Returns:   0
  #######################################
  reason() {
    local code=$1
    # Local with an explicit default: callers invoke this via $(reason ...), and
    # the subshell would otherwise inherit and echo the caller's stale REASON
    # for any unmapped code.
    local description="unknown error"

    # Reject empty / non-numeric input up front so the numeric tests below
    # never see a malformed operand.
    case "${code}" in
      '' | *[!0-9]*)
        echo "${description}"
        return 0
        ;;
    esac

    if [ "${code}" -eq 0 ]; then
      description="success"
    elif [ "${code}" -ge 1 ] && [ "${code}" -lt 10 ]; then
      description="runtime error"
    elif [ "${code}" -ge 10 ] && [ "${code}" -lt 20 ]; then
      description="infra error"
    elif [ "${code}" -ge 30 ] && [ "${code}" -lt 40 ]; then
      description="infra timeout"
    elif [ "${code}" -eq 42 ]; then
      # Must be checked before the generic 40-49 range below.
      description="command timeout"
    elif [ "${code}" -ge 40 ] && [ "${code}" -lt 50 ]; then
      description="command error"
    fi
    echo "${description}"
  }
  # Configuration with overridable defaults. A default is applied only when the
  # variable is unset; a value that is set but empty is left untouched.
  : "${RAY_TEST_SCRIPT=ray_release/scripts/run_release_test.py}"
  : "${RAY_TEST_REPO=https://github.com/ray-project/ray.git}"
  : "${RAY_TEST_BRANCH=master}"
  : "${RELEASE_RESULTS_DIR=/tmp/artifacts}"

  # Make the repository, branch and results directory visible to child
  # processes. RAY_TEST_SCRIPT is not exported.
  export RAY_TEST_REPO
  export RAY_TEST_BRANCH
  export RELEASE_RESULTS_DIR
  # Install the Python dependencies listed in requirements.txt (after removing
  # any installed ray package) unless NO_INSTALL is set to a non-empty value.
  if [ -z "${NO_INSTALL}" ]; then
    pip uninstall -q -y ray
    pip install -q -r requirements.txt
    pip install -q -U boto3 botocore
  fi

  # Unless NO_CLONE is set, make a shallow clone of RAY_TEST_BRANCH from
  # RAY_TEST_REPO into a fresh temporary directory and continue from its
  # release/ subdirectory.
  if [ -z "${NO_CLONE}" ]; then
    # NOTE(review): TMPDIR is also the variable mktemp and other tools consult
    # for the temp location; if it is already exported in the environment,
    # child processes will see the clone path. The name is kept because the
    # cleanup at the end of the script removes "${TMPDIR}".
    TMPDIR=$(mktemp -d -t release-XXXXXXXXXX)
    git clone --depth 1 -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" "${TMPDIR}"
    # Do not carry on in the wrong directory: the editable install below and
    # the relative test script path both depend on being inside release/.
    if ! pushd "${TMPDIR}/release"; then
      echo "Could not enter ${TMPDIR}/release in the cloned repository, aborting." >&2
      rm -rf -- "${TMPDIR}"
      exit 1
    fi
  fi

  # Editable install of the package in the current directory (the cloned
  # release/ directory when a clone was made).
  if [ -z "${NO_INSTALL}" ]; then
    pip install -e .
  fi
  # Number of attempts made so far, and the exit code recorded for each attempt.
  RETRY_NUM=0
  ALL_EXIT_CODES=()

  #######################################
  # Print $1 when it is a positive decimal integer, otherwise print $2.
  # Arguments: $1 - candidate value, $2 - fallback
  # Outputs:   the chosen value on stdout
  #######################################
  _positive_int_or() {
    case "$1" in
      '' | *[!0-9]*)
        printf '%s\n' "$2"
        ;;
      *)
        if [ "$1" -ge 1 ]; then
          printf '%s\n' "$1"
        else
          printf '%s\n' "$2"
        fi
        ;;
    esac
  }

  # Maximum number of attempts (default 1). A value that is zero, negative or
  # not a number would make the retry loop run zero times, i.e. the test would
  # never be executed, so such values fall back to a single attempt.
  REQUESTED_MAX_RETRIES=${MAX_RETRIES-1}
  MAX_RETRIES=$(_positive_int_or "${REQUESTED_MAX_RETRIES}" 1)
  if [ "${MAX_RETRIES}" != "${REQUESTED_MAX_RETRIES}" ]; then
    echo "Ignoring invalid MAX_RETRIES='${REQUESTED_MAX_RETRIES}', running the test once." >&2
  fi

  # BUILDKITE_RETRY_COUNT >= 1 limits the run to one attempt. ':-' also covers a
  # variable that is set but empty, which would otherwise be a malformed
  # operand for the numeric test.
  if [ "${BUILDKITE_RETRY_COUNT:-0}" -ge 1 ]; then
    echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
    MAX_RETRIES=1
  fi
  # Run the release test script until it succeeds, fails with an exit code that
  # is not retried, or MAX_RETRIES attempts have been made. Leaves the exit code
  # of the last attempt in EXIT_CODE and appends every attempt's exit code to
  # ALL_EXIT_CODES.
  while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
    RETRY_NUM=$((RETRY_NUM + 1))

    if [ "$RETRY_NUM" -gt 1 ]; then
      # Sleep for a random time before retrying: 1800 + [0, 5399] seconds,
      # i.e. between 30 and (just under) 120 minutes. OVERRIDE_SLEEP_TIME, when
      # non-empty, replaces the random value.
      # NOTE(review): this was documented as "30 to 90 minutes"; if 90 is the
      # intended upper bound the modulus should be 3600 - confirm.
      SLEEP_TIME=$((1800 + RANDOM % 5400))
      if [ -n "${OVERRIDE_SLEEP_TIME}" ]; then
        SLEEP_TIME=${OVERRIDE_SLEEP_TIME}
      fi
      echo "----------------------------------------"
      echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
      echo "----------------------------------------"
      sleep "${SLEEP_TIME}"
    fi

    # Start each attempt with an empty results directory (best effort). The
    # ':?' guard aborts instead of expanding to `rm -rf /*` when
    # RELEASE_RESULTS_DIR is set but empty.
    if [ -z "${NO_ARTIFACTS}" ]; then
      sudo rm -rf -- "${RELEASE_RESULTS_DIR:?RELEASE_RESULTS_DIR must not be empty}"/* || true
    fi

    # A failing test run must not abort this script; capture its status instead.
    set +e
    python "${RAY_TEST_SCRIPT}" "$@"
    EXIT_CODE=$?
    set -e

    REASON=$(reason "${EXIT_CODE}")
    ALL_EXIT_CODES+=("${EXIT_CODE}")

    case "${EXIT_CODE}" in
      0)
        echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
        break
        ;;
      30 | 31 | 32 | 33)
        # These codes are the only ones that are retried (loop continues).
        echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
        ;;
      *)
        echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
        break
        ;;
    esac
  done
  # Unless NO_ARTIFACTS is set, replace the contents of
  # /tmp/ray_release_test_artifacts with the contents of RELEASE_RESULTS_DIR.
  # Best effort: failures are ignored so the final result is still reported.
  if [ -z "${NO_ARTIFACTS}" ]; then
    sudo rm -rf /tmp/ray_release_test_artifacts/* || true
    # With an empty RELEASE_RESULTS_DIR the source glob would expand to `/*`
    # and copy the whole root file system, so skip the copy in that case.
    if [ -n "${RELEASE_RESULTS_DIR}" ]; then
      sudo cp -rf -- "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
    else
      echo "RELEASE_RESULTS_DIR is empty, not copying artifacts." >&2
    fi
  fi
  # Print a summary: the final exit code, one line per attempt with its exit
  # code and reason, and a hint for the release manager.
  SEPARATOR="----------------------------------------"

  echo "${SEPARATOR}"
  echo "Release test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
  echo "Run results:"
  # Attempts are numbered from 1 in the output.
  for ((RUN_INDEX = 0; RUN_INDEX < ${#ALL_EXIT_CODES[@]}; RUN_INDEX++)); do
    RUN_CODE=${ALL_EXIT_CODES[RUN_INDEX]}
    echo " Run $((RUN_INDEX + 1)): Exit code = ${RUN_CODE} ($(reason "${RUN_CODE}"))"
  done
  echo "${SEPARATOR}"

  REASON=$(reason "${EXIT_CODE}")
  echo "Final release test exit code is ${EXIT_CODE} (${REASON})"

  # Pick the hint: 0 means passed, 30-39 suggests restarting, anything else
  # asks for a log review.
  if [ "$EXIT_CODE" -eq 0 ]; then
    VERDICT="This test seems to have passed."
  elif [ "$EXIT_CODE" -ge 30 ] && [ "$EXIT_CODE" -lt 40 ]; then
    VERDICT="This is likely an infra error that can be solved by RESTARTING this test."
  else
    VERDICT="This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
  fi
  echo "RELEASE MANAGER: ${VERDICT}"
  # Leave and delete the temporary clone directory (skipped when NO_CLONE is
  # set). Both steps are best effort so the exit code below is always reached.
  if [ -z "${NO_CLONE}" ]; then
    popd || true
    rm -rf -- "${TMPDIR}" || true
  fi

  # Propagate the exit code of the last run. If no run recorded one, report
  # failure: an unquoted empty expansion would turn this into a bare `exit`,
  # which returns the status of the previous command (normally 0).
  exit "${EXIT_CODE:-1}"