# CI step group "ml tests": image builds first, then the test steps that
# consume those images.
group: ml tests
steps:
# builds
# Wanda build from min.build.wanda.yaml with the `ml` extra on Python 3.9.
# The "train minimal" step depends on this build.
- name: minbuild-ml
  label: "wanda: minbuild-ml-py39"
  wanda: ci/docker/min.build.wanda.yaml
  depends_on: oss-ci-base_build
  env:
    PYTHON_VERSION: "3.9"
    EXTRA_DEPENDENCY: ml
# Non-GPU ML image: builds `mlbuild` from the oss-ci-base_ml base image.
- name: mlbuild
  wanda: ci/docker/ml.build.wanda.yaml
  depends_on: oss-ci-base_ml
  env:
    IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml
    IMAGE_TO: mlbuild
    # Quoted so the value stays a string rather than a YAML boolean.
    RAYCI_IS_GPU_BUILD: "false"
# Per-Python-version variant of the non-GPU ML image; `{{matrix}}` is
# replaced by each entry of the `matrix` list below.
- name: mlbuild-multipy
  label: "wanda: mlbuild-py{{matrix}}"
  wanda: ci/docker/ml.build.wanda.yaml
  depends_on: oss-ci-base_ml-multipy
  env:
    IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml-py{{matrix}}
    IMAGE_TO: mlbuild-py{{matrix}}
    PYTHON: "{{matrix}}"
    RAYCI_IS_GPU_BUILD: "false"
  matrix:
  # Quoted so the version is not read as a float.
  - "3.12"
# GPU image used by the "train gpu lightning 2.0 tests" step.
- name: mllightning2gpubuild
  wanda: ci/docker/mllightning2gpu.build.wanda.yaml
  depends_on: oss-ci-base_gpu
# GPU ML image: same wanda spec as mlbuild, built from oss-ci-base_gpu with
# RAYCI_IS_GPU_BUILD set to "true".
- name: mlgpubuild
  wanda: ci/docker/ml.build.wanda.yaml
  depends_on: oss-ci-base_gpu
  env:
    IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu
    IMAGE_TO: mlgpubuild
    RAYCI_IS_GPU_BUILD: "true"
# Per-Python-version variant of the GPU ML image; `{{matrix}}` is replaced
# by each entry of the `matrix` list below.
- name: mlgpubuild-multipy
  label: "wanda: mlgpubuild-py{{matrix}}"
  wanda: ci/docker/ml.build.wanda.yaml
  depends_on: oss-ci-base_gpu-multipy
  env:
    IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu-py{{matrix}}
    IMAGE_TO: mlgpubuild-py{{matrix}}
    PYTHON: "{{matrix}}"
    RAYCI_IS_GPU_BUILD: "true"
  matrix:
  # Quoted so the version is not read as a float.
  - "3.12"
# tests
# Train tests on the mlbuild image, split across 2 parallel jobs. GPU,
# minimal, tune, doctest and credential-tagged tests are excluded here.
- label: ":train: ml: train tests"
  tags: train
  instance_type: large
  parallelism: 2
  commands:
  # One command: the continuation lines fold into a single string.
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
    --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
    --except-tags gpu_only,gpu,minimal,tune,doctest,needs_credentials
  depends_on: ["mlbuild", "forge"]
# Train/tune/air tests on the per-Python-version image, sharded 4 ways via
# the worker_id matrix dimension. Gated by the `if` condition below
# (continuous-build PR label or one of two pipeline ids).
- label: ":train: ml: {{matrix.python}} tests ({{matrix.worker_id}})"
  if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
  tags:
  - python
  - train
  - tune
  - ml
  instance_type: large
  commands:
  # --workers must stay in sync with the number of worker_id entries below.
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... //python/ray/tune/... //python/ray/air/... ml
    --workers 4 --worker-id {{matrix.worker_id}} --parallelism-per-worker 3
    --python-version {{matrix.python}}
    --except-tags gpu_only,gpu,minimal,doctest,needs_credentials,soft_imports,rllib,multinode
  depends_on:
  - mlbuild-multipy
  - forge
  # NOTE(review): job_env has no value here, so it parses as null — confirm
  # the intended value or remove the key.
  job_env:
  matrix:
    setup:
      python: ["3.12"]
      worker_id: ["0", "1", "2", "3"]
# GPU-tagged train/air/doc tests on the mlgpubuild image, split across 2
# parallel jobs.
- label: ":train: ml: train gpu tests"
  tags:
  - train
  - gpu
  instance_type: gpu-large
  parallelism: 2
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... //python/ray/air/... //doc/... ml
    --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 2
    --build-name mlgpubuild
    --only-tags gpu,gpu_only
  depends_on: ["mlgpubuild", "forge"]
# GPU-tagged train/air/doc tests on the per-Python-version GPU image,
# sharded 2 ways via the worker_id matrix dimension. Gated by the `if`
# condition below (continuous-build PR label or one of two pipeline ids).
- label: ":train: ml: train gpu {{matrix.python}} tests ({{matrix.worker_id}})"
  if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
  tags:
  - train
  - gpu
  instance_type: gpu-large
  commands:
  # --workers must stay in sync with the number of worker_id entries below.
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... //python/ray/air/... //doc/... ml
    --workers 2 --worker-id {{matrix.worker_id}} --parallelism-per-worker 2
    --python-version {{matrix.python}}
    --build-name mlgpubuild-py{{matrix.python}}
    --only-tags gpu,gpu_only
    --except-tags doctest
  depends_on: ["mlgpubuild-multipy", "forge"]
  matrix:
    setup:
      python: ["3.12"]
      worker_id: ["0", "1"]
# Train tests tagged needs_credentials. Credentials are set up by the first
# two commands, and the WANDB/COMET API keys are forwarded into the test
# environment.
- label: ":train: ml: train authentication tests"
  tags:
  - train
  - branch
  - skip-on-premerge
  - oss
  instance_type: medium
  commands:
  - pip install -U boto3==1.28.70 awscli==1.29.70
  # The script's stdout is executed as a shell command.
  - $(python ci/env/setup_credentials.py)
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
    --parallelism-per-worker 3
    --only-tags needs_credentials
    --test-env=WANDB_API_KEY --test-env=COMET_API_KEY
  depends_on: ["mlbuild", "forge"]
# Tune tests on the mlbuild image. The soft_imports, gpu_only, rllib and
# multinode tags are excluded here.
- label: ":train: ml: tune tests"
  tags: tune
  instance_type: large
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
    --parallelism-per-worker 3
    --except-tags soft_imports,gpu_only,rllib,multinode
  depends_on: ["mlbuild", "forge"]
# Tune tests tagged soft_imports, run against the oss-ci-base_build image
# instead of the ML image.
- label: ":train: ml: tune soft import tests"
  tags: tune
  instance_type: small
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
    --only-tags soft_imports
    --build-name oss-ci-base_build
  depends_on: ["oss-ci-base_build", "forge"]
# AIR tests (non-GPU), followed by the ray_air-tagged data tests. The second
# command passes --skip-ray-installation.
- label: ":train: ml: air tests"
  tags: ml
  instance_type: large
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/air/... ml
    --parallelism-per-worker 3
    --except-tags gpu
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/data/... ml
    --parallelism-per-worker 3
    --only-tags ray_air
    --skip-ray-installation
  depends_on: ["mlbuild", "forge"]
# Train tests carrying the `tune` tag, which the plain "train tests" step
# excludes.
- label: ":train: ml: train+tune tests"
  tags: train
  instance_type: medium
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
    --parallelism-per-worker 3
    --only-tags tune
    --except-tags gpu_only,ray_air,gpu,doctest,needs_credentials
  depends_on: ["mlbuild", "forge"]
# Tune tests carrying the `rllib` tag, which the plain "tune tests" step
# excludes.
- label: ":train: ml: rllib+tune tests"
  tags:
  - tune
  - rllib
  instance_type: large
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
    --parallelism-per-worker 3
    --only-tags rllib
    --except-tags gpu_only
  depends_on: ["mlbuild", "forge"]
# Runs the //release/... targets on the mlbuild image.
- label: ":train: ml: release tests"
  tags:
  - ml
  - python
  - release_tests
  instance_type: large
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //release/... ml
    --parallelism-per-worker 3
  depends_on: ["mlbuild", "forge"]
# Train tests tagged `minimal`, run on the image produced by the minbuild-ml
# step after the check_minimal_install.py script.
- label: ":train: ml: train minimal"
  tags: train
  instance_type: small
  commands:
  - python ./ci/env/check_minimal_install.py
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
    --parallelism-per-worker 2
    --build-name minbuild-ml-py3.9
    --only-tags minimal
  depends_on: ["minbuild-ml", "forge"]
# Tune tests tagged `multinode`. Builds a rayproject/ray:multinode image
# first, then passes it to the tests through RAY_DOCKER_IMAGE.
- label: ":train: ml: tune multinode tests"
  tags: tune
  instance_type: medium
  commands:
  - bazel run //ci/ray_ci:build_in_docker -- docker
    --platform cpu --image-type ray --canonical-tag multinode
  # Source and target image names are the same tag.
  - python ./ci/build/build-multinode-image.py rayproject/ray:multinode rayproject/ray:multinode
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
    --only-tags multinode
    --test-env=RAY_HAS_SSH=1
    --test-env=RAY_DOCKER_IMAGE=rayproject/ray:multinode
    --test-env=RAY_TEMPDIR="/ray-mount"
    --test-env=RAY_HOSTDIR="$${RAYCI_CHECKOUT_DIR}"
    --test-env=RAY_TESTHOST="rayci.localhost"
  depends_on:
  - manylinux
  - forge
  - raycpubase
  - mlbuild

# Documentation checks: doctest-tagged tests first, then the remaining
# non-GPU doc examples.
- label: ":train: ml: doc tests"
  tags:
  - train
  - tune
  - doc
  instance_type: large
  commands:
  # doc tests
  # NOTE(review): the first target lacks the leading `//` used by every other
  # step in this file — confirm that is intentional.
  - bazel run //ci/ray_ci:test_in_docker -- python/ray/... //doc/... ml
    --only-tags doctest
    --except-tags gpu
    --parallelism-per-worker 3
  # doc examples
  - bazel run //ci/ray_ci:test_in_docker -- //doc/... ml
    --except-tags gpu,post_wheel_build,doctest,highly_parallel
    --parallelism-per-worker 3
    --skip-ray-installation
  depends_on: ["mlbuild", "forge"]
# Train tests tagged ptl_v2, run on the mllightning2gpubuild image.
- label: ":train: ml: train gpu lightning 2.0 tests"
  tags:
  - train
  - gpu
  instance_type: gpu-large
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
    --build-name mllightning2gpubuild
    --only-tags ptl_v2
  depends_on: ["mllightning2gpubuild", "forge"]
# Flaky non-GPU tests across the whole workspace (//...). soft_fail is set,
# so a failure here is reported as a soft failure.
- label: ":train: ml: flaky tests"
  key: ml_flaky_tests
  tags:
  - train
  - skip-on-premerge
  instance_type: large
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //... ml --run-flaky-tests
    --parallelism-per-worker 2
    --except-tags gpu_only,gpu,needs_credentials
  depends_on: ["mlbuild", "forge"]
  soft_fail: true
# Flaky tests tagged needs_credentials. Uses the same credential setup as
# the "train authentication tests" step, with soft_fail set.
- label: ":train: ml: flaky authentication tests"
  tags:
  - train
  - branch
  - skip-on-premerge
  - oss
  instance_type: medium
  commands:
  - pip install -U boto3==1.28.70 awscli==1.29.70
  # The script's stdout is executed as a shell command.
  - $(python ci/env/setup_credentials.py)
  - bazel run //ci/ray_ci:test_in_docker -- //... ml --run-flaky-tests
    --parallelism-per-worker 3
    --only-tags needs_credentials
    --test-env=WANDB_API_KEY --test-env=COMET_API_KEY
  depends_on: ["mlbuild", "forge"]
  soft_fail: true
# Flaky GPU-tagged tests across the whole workspace (//...), run on the
# mlgpubuild image with soft_fail set.
- label: ":train: ml: train gpu flaky tests"
  key: ml_flaky_gpu_tests
  tags:
  - train
  - skip-on-premerge
  - gpu
  instance_type: gpu-large
  commands:
  - bazel run //ci/ray_ci:test_in_docker -- //... ml --run-flaky-tests
    --parallelism-per-worker 2
    --build-name mlgpubuild
    --only-tags gpu,gpu_only
  depends_on: ["mlgpubuild", "forge"]
  soft_fail: true