# Pipeline group for the ML (train / tune / air) image builds and test jobs.
group: ml tests
steps:
  # builds
  # Wanda build of ci/docker/min.build.wanda.yaml, parameterized with
  # PYTHON_VERSION 3.9 and EXTRA_DEPENDENCY ml; waits for oss-ci-base_build.
  # NOTE(review): the label spells the version "py39" while the
  # "train minimal" test step requests --build-name minbuild-ml-py3.9 --
  # confirm which spelling the produced image actually carries.
  - name: minbuild-ml
    depends_on: oss-ci-base_build
    label: "wanda: minbuild-ml-py39"
    env:
      EXTRA_DEPENDENCY: ml
      PYTHON_VERSION: "3.9"
    wanda: ci/docker/min.build.wanda.yaml

  # CPU ML image: wanda build of ci/docker/ml.build.wanda.yaml with
  # IMAGE_FROM pointing at oss-ci-base_ml and IMAGE_TO set to mlbuild.
  - name: mlbuild
    depends_on: oss-ci-base_ml
    wanda: ci/docker/ml.build.wanda.yaml
    env:
      RAYCI_IS_GPU_BUILD: "false"
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml
      IMAGE_TO: mlbuild

  # Per-Python-version variant of the CPU ML image. {{matrix}} expands to
  # each entry of the matrix list below (currently only "3.12") and is
  # substituted into the label, the base image, the target image and PYTHON.
  - name: mlbuild-multipy
    depends_on: oss-ci-base_ml-multipy
    label: "wanda: mlbuild-py{{matrix}}"
    matrix:
      - "3.12"
    wanda: ci/docker/ml.build.wanda.yaml
    env:
      PYTHON: "{{matrix}}"
      RAYCI_IS_GPU_BUILD: "false"
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml-py{{matrix}}
      IMAGE_TO: mlbuild-py{{matrix}}

  # GPU image used by the "train gpu lightning 2.0 tests" step; wanda build
  # of ci/docker/mllightning2gpu.build.wanda.yaml after oss-ci-base_gpu.
  - name: mllightning2gpubuild
    depends_on: oss-ci-base_gpu
    wanda: ci/docker/mllightning2gpu.build.wanda.yaml

  # GPU ML image: same wanda file as mlbuild, but based on oss-ci-base_gpu
  # and with RAYCI_IS_GPU_BUILD set to "true".
  - name: mlgpubuild
    depends_on: oss-ci-base_gpu
    wanda: ci/docker/ml.build.wanda.yaml
    env:
      RAYCI_IS_GPU_BUILD: "true"
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu
      IMAGE_TO: mlgpubuild

  # Per-Python-version variant of the GPU ML image. {{matrix}} expands to
  # each entry of the matrix list below (currently only "3.12") and is
  # substituted into the label, the base image, the target image and PYTHON.
  - name: mlgpubuild-multipy
    depends_on: oss-ci-base_gpu-multipy
    label: "wanda: mlgpubuild-py{{matrix}}"
    matrix:
      - "3.12"
    wanda: ci/docker/ml.build.wanda.yaml
    env:
      PYTHON: "{{matrix}}"
      RAYCI_IS_GPU_BUILD: "true"
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu-py{{matrix}}
      IMAGE_TO: mlgpubuild-py{{matrix}}

  # tests
  # Train test targets, sharded over 2 parallel jobs. The worker count and
  # worker id are taken from the BUILDKITE_PARALLEL_JOB* variables ("$$"
  # keeps the "$" literal until the job runs). GPU, minimal, tune, doctest
  # and credential-gated tests are excluded from this step.
  - label: ":train: ml: train tests"
    tags: train
    instance_type: large
    parallelism: 2
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}"
        --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --parallelism-per-worker 3
        --except-tags gpu_only,gpu,minimal,tune,doctest,needs_credentials

  # Train + tune + air test targets against the per-Python-version image.
  # Only runs for pull requests labelled "continuous-build" or on the two
  # pipelines whose ids are listed in the condition.
  # The job is expanded once per (python, worker_id) pair; --workers 4 must
  # stay equal to the number of worker_id entries in the matrix.
  - label: ":train: ml: {{matrix.python}} tests ({{matrix.worker_id}})"
    if: >-
      build.pull_request.labels includes "continuous-build"
      || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89"
      || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
    tags:
      - python
      - train
      - tune
      - ml
    instance_type: large
    depends_on:
      - mlbuild-multipy
      - forge
    # NOTE(review): this key was a bare "job_env:" with no value, which YAML
    # reads as null. It is spelled out here so the emptiness is visibly
    # deliberate; confirm whether a concrete value was intended or whether
    # the key can simply be dropped.
    job_env: null
    matrix:
      setup:
        python:
          - "3.12"
        worker_id:
          - "0"
          - "1"
          - "2"
          - "3"
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker --
        //python/ray/train/... //python/ray/tune/... //python/ray/air/... ml
        --workers 4 --worker-id {{matrix.worker_id}}
        --parallelism-per-worker 3
        --python-version {{matrix.python}}
        --except-tags gpu_only,gpu,minimal,doctest,needs_credentials,soft_imports,rllib,multinode

  # GPU-tagged tests from the train, air and doc targets, on gpu-large
  # instances. Sharded over 2 parallel jobs and run with the mlgpubuild
  # image (--build-name).
  - label: ":train: ml: train gpu tests"
    tags:
      - train
      - gpu
    instance_type: gpu-large
    parallelism: 2
    depends_on:
      - mlgpubuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker --
        //python/ray/train/... //python/ray/air/... //doc/... ml
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}"
        --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --parallelism-per-worker 2
        --build-name mlgpubuild
        --only-tags gpu,gpu_only

  # Per-Python-version GPU tests (train, air and doc targets), doctests
  # excluded. Only runs for pull requests labelled "continuous-build" or on
  # the two pipelines whose ids are listed in the condition.
  # --workers 2 must stay equal to the number of worker_id matrix entries.
  - label: ":train: ml: train gpu {{matrix.python}} tests ({{matrix.worker_id}})"
    if: >-
      build.pull_request.labels includes "continuous-build"
      || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89"
      || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
    tags:
      - train
      - gpu
    instance_type: gpu-large
    depends_on:
      - mlgpubuild-multipy
      - forge
    matrix:
      setup:
        python:
          - "3.12"
        worker_id:
          - "0"
          - "1"
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker --
        //python/ray/train/... //python/ray/air/... //doc/... ml
        --workers 2 --worker-id {{matrix.worker_id}}
        --parallelism-per-worker 2
        --python-version {{matrix.python}}
        --build-name mlgpubuild-py{{matrix.python}}
        --only-tags gpu,gpu_only
        --except-tags doctest

  # Train tests tagged needs_credentials. WANDB_API_KEY and COMET_API_KEY
  # are forwarded into the test environment via --test-env.
  - label: ":train: ml: train authentication tests"
    tags:
      - train
      - branch
      - skip-on-premerge
      - oss
    instance_type: medium
    depends_on:
      - mlbuild
      - forge
    commands:
      - pip install -U boto3==1.28.70 awscli==1.29.70
      # The command substitution runs whatever the script prints as a shell
      # command -- presumably export statements for the API keys; verify
      # against ci/env/setup_credentials.py.
      - $(python ci/env/setup_credentials.py)
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --parallelism-per-worker 3
        --only-tags needs_credentials
        --test-env=WANDB_API_KEY
        --test-env=COMET_API_KEY

  # Tune test targets, leaving out the soft_imports, gpu_only, rllib and
  # multinode tags (each of those except gpu_only has its own step).
  - label: ":train: ml: tune tests"
    tags: tune
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --parallelism-per-worker 3
        --except-tags soft_imports,gpu_only,rllib,multinode

  # Tune tests tagged soft_imports, run with the oss-ci-base_build image
  # (--build-name) rather than the mlbuild image.
  - label: ":train: ml: tune soft import tests"
    tags: tune
    instance_type: small
    depends_on:
      - oss-ci-base_build
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --only-tags soft_imports
        --build-name oss-ci-base_build

  # Two passes: the air test targets without the gpu tag, then the data
  # test targets restricted to the ray_air tag. The second pass is given
  # --skip-ray-installation.
  - label: ":train: ml: air tests"
    tags: ml
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/air/... ml
        --parallelism-per-worker 3
        --except-tags gpu
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/data/... ml
        --parallelism-per-worker 3
        --only-tags ray_air
        --skip-ray-installation

  # Train test targets tagged tune (the tag the plain "train tests" step
  # excludes), minus GPU, ray_air, doctest and credential-gated tests.
  - label: ":train: ml: train+tune tests"
    tags: train
    instance_type: medium
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --parallelism-per-worker 3
        --only-tags tune
        --except-tags gpu_only,ray_air,gpu,doctest,needs_credentials

  # Tune test targets tagged rllib (the tag the plain "tune tests" step
  # excludes), minus gpu_only tests.
  - label: ":train: ml: rllib+tune tests"
    tags:
      - tune
      - rllib
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --parallelism-per-worker 3
        --only-tags rllib
        --except-tags gpu_only

  # Runs every test target under //release/... with no tag filtering.
  - label: ":train: ml: release tests"
    tags:
      - ml
      - python
      - release_tests
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //release/... ml
        --parallelism-per-worker 3

  # Train tests tagged minimal, run with the minbuild-ml-py3.9 image
  # (--build-name) after ci/env/check_minimal_install.py has been run.
  - label: ":train: ml: train minimal"
    tags: train
    instance_type: small
    depends_on:
      - minbuild-ml
      - forge
    commands:
      - python ./ci/env/check_minimal_install.py
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --parallelism-per-worker 2
        --build-name minbuild-ml-py3.9
        --only-tags minimal

  # Tune tests tagged multinode. Three commands, in order:
  #   1. build_in_docker with --canonical-tag multinode,
  #   2. build-multinode-image.py, called with rayproject/ray:multinode as
  #      both of its arguments,
  #   3. the tests, with the RAY_* settings below passed through --test-env.
  - label: ":train: ml: tune multinode tests"
    tags: tune
    instance_type: medium
    depends_on:
      - manylinux
      - forge
      - raycpubase
      - mlbuild
    commands:
      - >-
        bazel run //ci/ray_ci:build_in_docker -- docker
        --platform cpu
        --image-type ray
        --canonical-tag multinode
      - >-
        python ./ci/build/build-multinode-image.py
        rayproject/ray:multinode rayproject/ray:multinode
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tune/... ml
        --only-tags multinode
        --test-env=RAY_HAS_SSH=1
        --test-env=RAY_DOCKER_IMAGE=rayproject/ray:multinode
        --test-env=RAY_TEMPDIR="/ray-mount"
        --test-env=RAY_HOSTDIR="$${RAYCI_CHECKOUT_DIR}"
        --test-env=RAY_TESTHOST="rayci.localhost"
  
  # Documentation checks in two passes: doctest-tagged targets under
  # python/ray and doc, then the remaining doc examples. Both passes exclude
  # the gpu tag.
  - label: ":train: ml: doc tests"
    tags:
      - train
      - tune
      - doc
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      # doc tests
      # The python/ray pattern is written as an absolute "//" label, like
      # every other target pattern in this file, so it does not depend on
      # the directory the command is resolved from.
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/... //doc/... ml
        --only-tags doctest
        --except-tags gpu
        --parallelism-per-worker 3
      # doc examples
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //doc/... ml
        --except-tags gpu,post_wheel_build,doctest,highly_parallel
        --parallelism-per-worker 3
        --skip-ray-installation

  # Train tests tagged ptl_v2, run on gpu-large instances with the
  # mllightning2gpubuild image (--build-name).
  - label: ":train: ml: train gpu lightning 2.0 tests"
    tags:
      - train
      - gpu
    instance_type: gpu-large
    depends_on:
      - mllightning2gpubuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/train/... ml
        --build-name mllightning2gpubuild
        --only-tags ptl_v2

  # Flaky-test run (--run-flaky-tests) over all targets (//...), excluding
  # GPU and credential-gated tests. Marked soft_fail.
  - label: ":train: ml: flaky tests"
    key: ml_flaky_tests
    soft_fail: true
    tags:
      - train
      - skip-on-premerge
    instance_type: large
    depends_on:
      - mlbuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //... ml
        --run-flaky-tests
        --parallelism-per-worker 2
        --except-tags gpu_only,gpu,needs_credentials

  # Flaky-test run (--run-flaky-tests) over all targets (//...), restricted
  # to the needs_credentials tag. WANDB_API_KEY and COMET_API_KEY are
  # forwarded via --test-env. Marked soft_fail.
  - label: ":train: ml: flaky authentication tests"
    soft_fail: true
    tags:
      - train
      - branch
      - skip-on-premerge
      - oss
    instance_type: medium
    depends_on:
      - mlbuild
      - forge
    commands:
      - pip install -U boto3==1.28.70 awscli==1.29.70
      # The command substitution runs whatever the script prints as a shell
      # command -- presumably export statements for the API keys; verify
      # against ci/env/setup_credentials.py.
      - $(python ci/env/setup_credentials.py)
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //... ml
        --run-flaky-tests
        --parallelism-per-worker 3
        --only-tags needs_credentials
        --test-env=WANDB_API_KEY
        --test-env=COMET_API_KEY

  # Flaky-test run (--run-flaky-tests) over all targets (//...), restricted
  # to GPU-tagged tests and run with the mlgpubuild image on gpu-large
  # instances. Marked soft_fail.
  - label: ":train: ml: train gpu flaky tests"
    key: ml_flaky_gpu_tests
    soft_fail: true
    tags:
      - train
      - skip-on-premerge
      - gpu
    instance_type: gpu-large
    depends_on:
      - mlgpubuild
      - forge
    commands:
      - >-
        bazel run //ci/ray_ci:test_in_docker -- //... ml
        --run-flaky-tests
        --parallelism-per-worker 2
        --build-name mlgpubuild
        --only-tags gpu,gpu_only