# ml.rayci.yml 9.7 KB
  # Pipeline group header: all entries under `steps` belong to the "ml tests" group.
  group: 'ml tests'
  steps:
  # builds
  # Wanda build `minbuild-ml` from ci/docker/min.build.wanda.yaml, parameterized
  # with Python 3.9 and the `ml` extra dependency; waits for oss-ci-base_build.
  - name: minbuild-ml
    label: 'wanda: minbuild-ml-py39'
    depends_on: oss-ci-base_build
    wanda: ci/docker/min.build.wanda.yaml
    env:
      EXTRA_DEPENDENCY: ml
      PYTHON_VERSION: '3.9'
  # Wanda build using ml.build.wanda.yaml: IMAGE_FROM is the oss-ci-base_ml
  # image, IMAGE_TO is `mlbuild`, and the GPU flag is passed as "false".
  - name: mlbuild
    depends_on: oss-ci-base_ml
    wanda: ci/docker/ml.build.wanda.yaml
    env:
      IMAGE_TO: mlbuild
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml
      RAYCI_IS_GPU_BUILD: 'false'
  # Same wanda file as `mlbuild`, expanded once per matrix entry; the matrix
  # value is substituted into the label, both image names and PYTHON.
  - name: mlbuild-multipy
    label: 'wanda: mlbuild-py{{matrix}}'
    depends_on: oss-ci-base_ml-multipy
    wanda: ci/docker/ml.build.wanda.yaml
    matrix:
      - '3.12'
    env:
      PYTHON: '{{matrix}}'
      IMAGE_TO: mlbuild-py{{matrix}}
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml-py{{matrix}}
      RAYCI_IS_GPU_BUILD: 'false'
  # Wanda build from mllightning2gpu.build.wanda.yaml; no env overrides are
  # set here, and it waits for oss-ci-base_gpu.
  - name: mllightning2gpubuild
    depends_on: oss-ci-base_gpu
    wanda: ci/docker/mllightning2gpu.build.wanda.yaml
  # GPU counterpart of `mlbuild`: same wanda file, but built from the
  # oss-ci-base_gpu image into `mlgpubuild` with the GPU flag set to "true".
  - name: mlgpubuild
    depends_on: oss-ci-base_gpu
    wanda: ci/docker/ml.build.wanda.yaml
    env:
      IMAGE_TO: mlgpubuild
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu
      RAYCI_IS_GPU_BUILD: 'true'
  # Matrix-expanded GPU build; the matrix value is substituted into the label,
  # both image names and PYTHON.
  - name: mlgpubuild-multipy
    label: 'wanda: mlgpubuild-py{{matrix}}'
    depends_on: oss-ci-base_gpu-multipy
    wanda: ci/docker/ml.build.wanda.yaml
    matrix:
      - '3.12'
    env:
      PYTHON: '{{matrix}}'
      IMAGE_TO: mlgpubuild-py{{matrix}}
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu-py{{matrix}}
      RAYCI_IS_GPU_BUILD: 'true'
  # tests
  # //python/ray/train/... with `parallelism: 2`; each job receives the
  # Buildkite parallel job count and index as --workers / --worker-id.
  # The command is a folded scalar: its lines are joined with single spaces.
  - label: ':train: ml: train tests'
    tags: train
    instance_type: large
    parallelism: 2
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}"
        --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --parallelism-per-worker 3
        --except-tags gpu_only,gpu,minimal,tune,doctest,needs_credentials
  # Train, tune and air targets for the matrix Python version, split into four
  # shards by the `worker_id` matrix dimension (--workers 4). The `if` limits
  # the step to PRs labelled "continuous-build" or to the two listed pipeline ids.
  - label: ":train: ml: {{matrix.python}} tests ({{matrix.worker_id}})"
    if: >-
      build.pull_request.labels includes "continuous-build"
      || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89"
      || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
    tags:
      - python
      - train
      - tune
      - ml
    instance_type: large
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker --
        //python/ray/train/... //python/ray/tune/... //python/ray/air/... ml
        --workers 4 --worker-id {{matrix.worker_id}} --parallelism-per-worker 3
        --python-version {{matrix.python}}
        --except-tags gpu_only,gpu,minimal,doctest,needs_credentials,soft_imports,rllib,multinode
    depends_on:
      - mlbuild-multipy
      - forge
    # This key was previously a bare `job_env:`, which YAML reads as null. The
    # null is now explicit so the empty value is visibly intentional and the
    # parsed document is unchanged.
    # NOTE(review): the GPU matrix step in this file carries no job_env at all;
    # confirm whether this key is a leftover that can be removed.
    job_env: null
    matrix:
      setup:
        python: ["3.12"]
        worker_id: ["0", "1", "2", "3"]
  # Tests tagged gpu / gpu_only from the train, air and doc targets, on a
  # gpu-large instance against the `mlgpubuild` image, with `parallelism: 2`.
  - label: ':train: ml: train gpu tests'
    tags:
      - train
      - gpu
    instance_type: gpu-large
    parallelism: 2
    depends_on:
      - mlgpubuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker --
        //python/ray/train/... //python/ray/air/... //doc/... ml
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}"
        --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --parallelism-per-worker 2
        --build-name mlgpubuild
        --only-tags gpu,gpu_only
  # Matrix variant of the GPU step: two shards (worker_id 0 and 1) per Python
  # version, using the mlgpubuild-py<version> image. Doctest-tagged tests are
  # excluded. The `if` gate is the same label / pipeline-id condition used by
  # the CPU matrix step.
  - label: ':train: ml: train gpu {{matrix.python}} tests ({{matrix.worker_id}})'
    if: >-
      build.pull_request.labels includes "continuous-build"
      || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89"
      || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
    tags:
      - train
      - gpu
    instance_type: gpu-large
    depends_on:
      - mlgpubuild-multipy
      - forge
    matrix:
      setup:
        python:
          - '3.12'
        worker_id:
          - '0'
          - '1'
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker --
        //python/ray/train/... //python/ray/air/... //doc/... ml
        --workers 2 --worker-id {{matrix.worker_id}} --parallelism-per-worker 2
        --python-version {{matrix.python}}
        --build-name mlgpubuild-py{{matrix.python}}
        --only-tags gpu,gpu_only
        --except-tags doctest
  # Train tests tagged needs_credentials. The first two commands install
  # pinned boto3/awscli and run the output of ci/env/setup_credentials.py;
  # WANDB_API_KEY and COMET_API_KEY are passed with --test-env.
  - label: ':train: ml: train authentication tests'
    tags:
      - train
      - branch
      - skip-on-premerge
      - oss
    instance_type: medium
    depends_on:
      - mlbuild
      - forge
    commands:
      - pip install -U boto3==1.28.70 awscli==1.29.70
      - $(python ci/env/setup_credentials.py)
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --parallelism-per-worker 3
        --only-tags needs_credentials
        --test-env=WANDB_API_KEY
        --test-env=COMET_API_KEY
  # //python/ray/tune/... minus the soft_imports, gpu_only, rllib and
  # multinode tags.
  - label: ':train: ml: tune tests'
    tags: tune
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --parallelism-per-worker 3
        --except-tags soft_imports,gpu_only,rllib,multinode
  # Only the soft_imports-tagged tune tests, run against the
  # oss-ci-base_build image rather than an ml build.
  - label: ':train: ml: tune soft import tests'
    tags: tune
    instance_type: small
    depends_on:
      - oss-ci-base_build
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --only-tags soft_imports
        --build-name oss-ci-base_build
  # Two invocations: the air targets without gpu-tagged tests, then the
  # ray_air-tagged data tests with --skip-ray-installation.
  - label: ':train: ml: air tests'
    tags: ml
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/air/... ml
        --parallelism-per-worker 3
        --except-tags gpu
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/data/... ml
        --parallelism-per-worker 3
        --only-tags ray_air
        --skip-ray-installation
  # Train targets restricted to the `tune` tag, excluding the gpu, ray_air,
  # doctest and credential tags.
  - label: ':train: ml: train+tune tests'
    tags: train
    instance_type: medium
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --parallelism-per-worker 3
        --only-tags tune
        --except-tags gpu_only,ray_air,gpu,doctest,needs_credentials
  # Tune targets restricted to the `rllib` tag, without gpu_only tests.
  - label: ':train: ml: rllib+tune tests'
    tags: [tune, rllib]
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --parallelism-per-worker 3
        --only-tags rllib
        --except-tags gpu_only
  # Everything under //release/... with no tag filters, on the mlbuild image.
  - label: ':train: ml: release tests'
    tags: [ml, python, release_tests]
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //release/... ml
        --parallelism-per-worker 3
  # Runs ci/env/check_minimal_install.py, then the `minimal`-tagged train
  # tests against the minbuild-ml-py3.9 image.
  - label: ':train: ml: train minimal'
    tags: train
    instance_type: small
    depends_on:
      - minbuild-ml
      - forge
    commands:
      - python ./ci/env/check_minimal_install.py
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --parallelism-per-worker 2
        --build-name minbuild-ml-py3.9
        --only-tags minimal
  # Three commands: build_in_docker with canonical tag `multinode`, then
  # build-multinode-image.py with rayproject/ray:multinode as both arguments,
  # then the multinode-tagged tune tests with the RAY_* variables passed via
  # --test-env.
  - label: ':train: ml: tune multinode tests'
    tags: tune
    instance_type: medium
    depends_on: [manylinux, forge, raycpubase, mlbuild]
    commands:
      - >-
        bazel run //ci/ray_ci:build_in_docker -- docker
        --platform cpu
        --image-type ray
        --canonical-tag multinode
      - python ./ci/build/build-multinode-image.py rayproject/ray:multinode rayproject/ray:multinode
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --only-tags multinode
        --test-env=RAY_HAS_SSH=1
        --test-env=RAY_DOCKER_IMAGE=rayproject/ray:multinode
        --test-env=RAY_TEMPDIR="/ray-mount"
        --test-env=RAY_HOSTDIR="$${RAYCI_CHECKOUT_DIR}"
        --test-env=RAY_TESTHOST="rayci.localhost"
  # Documentation checks in two passes: doctest-tagged tests across the Python
  # package and docs, then the remaining doc examples with
  # --skip-ray-installation.
  - label: ":train: ml: doc tests"
    tags:
      - train
      - tune
      - doc
    instance_type: large
    commands:
      # doc tests
      # The Python target uses the absolute `//python/ray/...` form, like every
      # other command in this file; it was previously the relative
      # `python/ray/...`.
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/... //doc/... ml
        --only-tags doctest
        --except-tags gpu
        --parallelism-per-worker 3
      # doc examples
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //doc/... ml
        --except-tags gpu,post_wheel_build,doctest,highly_parallel
        --parallelism-per-worker 3
        --skip-ray-installation
    depends_on: [ "mlbuild", "forge" ]
  # Train tests tagged ptl_v2, on a gpu-large instance against the
  # mllightning2gpubuild image.
  - label: ':train: ml: train gpu lightning 2.0 tests'
    tags: [train, gpu]
    instance_type: gpu-large
    depends_on:
      - mllightning2gpubuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --build-name mllightning2gpubuild
        --only-tags ptl_v2
  # Passes --run-flaky-tests over //... without the GPU and credential tags;
  # `soft_fail: true` is set on this step.
  - label: ':train: ml: flaky tests'
    key: ml_flaky_tests
    soft_fail: true
    tags: [train, skip-on-premerge]
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //... ml
        --run-flaky-tests
        --parallelism-per-worker 2
        --except-tags gpu_only,gpu,needs_credentials
  # Flaky-test variant of the authentication step: same credential setup
  # commands, then --run-flaky-tests over //... limited to the
  # needs_credentials tag. `soft_fail: true` is set on this step.
  - label: ':train: ml: flaky authentication tests'
    soft_fail: true
    tags: [train, branch, skip-on-premerge, oss]
    instance_type: medium
    depends_on:
      - mlbuild
      - forge
    commands:
      - pip install -U boto3==1.28.70 awscli==1.29.70
      - $(python ci/env/setup_credentials.py)
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //... ml
        --run-flaky-tests
        --parallelism-per-worker 3
        --only-tags needs_credentials
        --test-env=WANDB_API_KEY
        --test-env=COMET_API_KEY
  # GPU flaky run: --run-flaky-tests over //... limited to the gpu / gpu_only
  # tags, on the mlgpubuild image. `soft_fail: true` is set on this step.
  - label: ':train: ml: train gpu flaky tests'
    key: ml_flaky_gpu_tests
    soft_fail: true
    tags: [train, skip-on-premerge, gpu]
    instance_type: gpu-large
    depends_on:
      - mlgpubuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //... ml
        --run-flaky-tests
        --parallelism-per-worker 2
        --build-name mlgpubuild
        --only-tags gpu,gpu_only