# rllib.rayci.yml
  # Pipeline group for the RLlib test suite. The group as a whole declares a
  # dependency on `forge`.
  group: rllib tests
  depends_on:
  - forge
  steps:
  # builds
  # CPU image: built from the oss-ci-base_ml base image and published under
  # the name `rllibbuild`, which the CPU test steps list in `depends_on`.
  - name: rllibbuild
    wanda: ci/docker/rllib.build.wanda.yaml
    depends_on: oss-ci-base_ml
    env:
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml
      IMAGE_TO: rllibbuild
      # Quoted so the value stays a string instead of a YAML boolean.
      RAYCI_IS_GPU_BUILD: "false"
  # GPU image: same wanda spec, but based on oss-ci-base_gpu and with the GPU
  # build flag set.
  - name: rllibgpubuild
    wanda: ci/docker/rllib.build.wanda.yaml
    depends_on: oss-ci-base_gpu
    env:
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu
      IMAGE_TO: rllibgpubuild
      RAYCI_IS_GPU_BUILD: "true"
  # tests
  # Runs //rllib/... minus the tag groups listed in --except-tags. The
  # Buildkite parallel job count/index are forwarded as --workers/--worker-id.
  # `$$` presumably defers variable expansion to job runtime -- confirm against
  # the Buildkite interpolation rules.
  - label: ":brain: rllib: algorithm, model and others"
    tags: rllib_directly
    parallelism: 4
    instance_type: large
    commands:
    # Continuation lines of a plain scalar fold into one command line.
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
      --except-tags learning_tests,memory_leak_tests,examples,tests_dir,documentation,multi_gpu,no_cpu,torch_2.x_only_benchmark,manual
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    depends_on: rllibbuild
  # Torch learning tests, split into two invocations: the regular set (3 tests
  # in parallel per worker) and the `learning_tests_pytorch_use_all_core` set,
  # which is run without --parallelism-per-worker.
  - label: ":brain: rllib: learning tests pytorch"
    tags: rllib
    parallelism: 5
    instance_type: large
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
      --only-tags fake_gpus,learning_tests_discrete,crashing_cartpole,stateless_cartpole,learning_tests_continuous
      --except-tags tf_only,tf2_only,gpu,multi_gpu,learning_tests_pytorch_use_all_core
      --test-arg --framework=torch
    # Second invocation passes --skip-ray-installation (presumably reusing the
    # image prepared by the first command -- confirm).
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
      --only-tags learning_tests_pytorch_use_all_core
      --except-tags tf_only,tf2_only,gpu,multi_gpu
      --test-arg --framework=torch
      --skip-ray-installation
    depends_on: rllibbuild
  # Example scripts, split into the regular set (2 in parallel per worker) and
  # the `examples_use_all_core` set, which is run without
  # --parallelism-per-worker. GPU-tagged examples are excluded from both.
  - label: ":brain: rllib: examples"
    tags: rllib
    parallelism: 5
    instance_type: large
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 2
      --only-tags examples
      --except-tags multi_gpu,gpu,examples_use_all_core
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
      --only-tags examples_use_all_core
      --skip-ray-installation
      --except-tags multi_gpu,gpu
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    depends_on: rllibbuild
  # Tests tagged `tests_dir`, excluding multi-GPU and manual ones.
  - label: ":brain: rllib: tests dir"
    tags: rllib_directly
    parallelism: 2
    instance_type: large
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
      --only-tags tests_dir
      --except-tags multi_gpu,manual
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    depends_on: rllibbuild
  # Tests tagged `gpu`, run on a GPU instance against the rllibgpubuild image
  # with RLLIB_NUM_GPUS=1 exported into the test environment.
  - label: ":brain: rllib: gpu tests"
    tags:
    - rllib_gpu
    - gpu
    parallelism: 5
    instance_type: gpu
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
      --build-name rllibgpubuild
      --only-tags gpu
      --test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
      --test-env=RLLIB_NUM_GPUS=1
    depends_on: rllibgpubuild
  # Ray Data related RLlib tests. The `if` condition skips this step on the
  # master branch.
  - label: ":brain: rllib: data tests"
    if: build.branch != "master"
    tags:
    - data
    - rllib
    instance_type: large
    commands:
    # learning tests pytorch
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --parallelism-per-worker 3
      --only-tags learning_tests_with_ray_data
      --except-tags multi_gpu,gpu,tf_only,tf2_only
      --test-arg --framework=torch
    # rllib unittests
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --parallelism-per-worker 3
      --only-tags ray_data
      --except-tags learning_tests_with_ray_data,multi_gpu,gpu
      --skip-ray-installation  # reuse the same docker image as the previous run
    depends_on: rllibbuild
  # Runs only the tests tagged `torch_2.x_only_benchmark`.
  - label: ":brain: rllib: benchmarks"
    tags: rllib
    instance_type: medium
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --only-tags torch_2.x_only_benchmark
    depends_on: rllibbuild
  # Memory-leak tests with the torch framework; tests tagged `flaky` are
  # excluded here.
  - label: ":brain: rllib: memory leak pytorch tests"
    tags: rllib
    instance_type: medium
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --only-tags memory_leak_tests
      --except-tags flaky
      --test-arg --framework=torch
    depends_on: rllibbuild
  # Documentation checks in three passes: doctests, doc examples, and RLlib
  # tests tagged `documentation`. The 2nd and 3rd pass --skip-ray-installation.
  - label: ":brain: rllib: doc tests"
    tags:
    - rllib_directly
    - doc
    instance_type: medium
    commands:
    # doc tests
    - bazel run //ci/ray_ci:test_in_docker -- python/ray/... //doc/... rllib
      --except-tags gpu
      --only-tags doctest
      --parallelism-per-worker 2
    # doc examples
    - bazel run //ci/ray_ci:test_in_docker -- //doc/... rllib
      --except-tags gpu,post_wheel_build,timeseries_libs,doctest
      --parallelism-per-worker 2
      --skip-ray-installation
    # rllib tests tagged `documentation`
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --only-tags documentation
      --parallelism-per-worker 2
      --skip-ray-installation
    depends_on: rllibbuild
  # Tests tagged `multi_gpu`, run on a gpu-large instance with --gpus 4
  # against the rllibgpubuild image.
  - label: ":brain: rllib: multi-gpu tests"
    tags:
    - rllib_gpu
    - gpu
    parallelism: 5
    instance_type: gpu-large
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
      --parallelism-per-worker 2
      --gpus 4
      --build-name rllibgpubuild
      --only-tags multi_gpu
    depends_on: rllibgpubuild
  # Flaky variant of the multi-GPU step: adds --run-flaky-tests, carries the
  # `skip-on-premerge` tag, and is marked soft_fail so a failure does not fail
  # the build.
  - label: ":brain: rllib: flaky multi-gpu tests"
    key: rllib_flaky_multi_gpu_tests
    tags:
    - rllib_gpu
    - gpu
    - skip-on-premerge
    instance_type: gpu-large
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
      --parallelism-per-worker 2
      --gpus 4
      --build-name rllibgpubuild
      --only-tags multi_gpu
    depends_on: rllibgpubuild
    soft_fail: true
  # Flaky variant of the single-GPU step: adds --run-flaky-tests, carries the
  # `skip-on-premerge` tag, and is marked soft_fail.
  - label: ":brain: rllib: flaky gpu tests"
    key: rllib_flaky_gpu_tests
    tags:
    - rllib_gpu
    - gpu
    - skip-on-premerge
    instance_type: gpu
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
      --build-name rllibgpubuild
      --only-tags gpu
      --test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
      --test-env=RLLIB_NUM_GPUS=1
    depends_on: rllibgpubuild
    soft_fail: true
  # Flaky learning tests, one invocation per framework (torch, tf, tf2).
  # Marked soft_fail and tagged `skip-on-premerge`.
  - label: ":brain: rllib: flaky tests (learning tests)"
    key: rllib_flaky_tests_01
    tags:
    - rllib
    - skip-on-premerge
    instance_type: large
    commands:
    # torch
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
      --only-tags fake_gpus,learning_tests_discrete,learning_tests_with_ray_data,crashing_cartpole,stateless_cartpole,learning_tests_continuous
      --except-tags tf_only,tf2_only,multi_gpu,gpu
      --test-arg --framework=torch
    # tf2-static-graph
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
      --only-tags tf_only
      --except-tags torch_only,tf2_only,no_tf_static_graph,multi_gpu,gpu
      --test-arg --framework=tf
      --skip-ray-installation  # reuse the same docker image as the previous run
    # tf2-eager-tracing
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
      --only-tags tf2_only
      --except-tags fake_gpus,torch_only,multi_gpu,no_tf_eager_tracing,gpu
      --test-arg --framework=tf2
      --skip-ray-installation  # reuse the same docker image as the previous run
    depends_on: rllibbuild
    soft_fail: true
  # Flaky counterparts of the examples, rlmodule, algorithm/model and tests_dir
  # suites, run back to back in one job. Marked soft_fail and tagged
  # `skip-on-premerge`.
  - label: ":brain: rllib: flaky tests (examples/rlmodule/models/tests_dir)"
    key: rllib_flaky_tests_02
    tags:
    - rllib
    - skip-on-premerge
    instance_type: large
    commands:
    # examples
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
      --only-tags examples
      --except-tags multi_gpu,gpu
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    # rlmodule tests
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
      --only-tags rlm
      --except-tags multi_gpu,gpu
      --test-env RLLIB_ENABLE_RL_MODULE=1
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
      --skip-ray-installation  # reuse the same docker image as the previous run
    # algorithm, models
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
      --except-tags learning_tests,memory_leak_tests,examples,tests_dir,documentation,multi_gpu,gpu,no_cpu,torch_2.x_only_benchmark,manual
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
      --skip-ray-installation  # reuse the same docker image as the previous run
    # tests/ dir
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
      --only-tags tests_dir
      --except-tags multi_gpu,gpu,manual
      --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
      --skip-ray-installation  # reuse the same docker image as the previous run
    depends_on: rllibbuild
    soft_fail: true
  # Flaky memory-leak tests. Marked soft_fail and tagged `skip-on-premerge`.
  # NOTE(review): this command passes --run-flaky-tests while also excluding
  # the `flaky` tag, and uses --framework=tf2 whereas the non-flaky memory-leak
  # step uses torch. Confirm both are intended; behaviour is left unchanged.
  - label: ":brain: rllib: flaky tests (memory leak)"
    key: rllib_flaky_tests_03
    tags:
    - rllib
    - skip-on-premerge
    instance_type: medium
    commands:
    - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
      --run-flaky-tests
      --only-tags memory_leak_tests
      --except-tags flaky,gpu,multi_gpu
      --test-arg --framework=tf2
    depends_on: rllibbuild
    soft_fail: true
  261. soft_fail: true