release_tests.yaml 138 KB


  1. # Global release test configuration file.
  2. # All your release test configuration should go here. Adding release tests here
  3. # will automatically enable them in the Buildkite release testing schedules
  4. # (unless they have frequency: manual).
  5. # Here is an example configuration for reference:
  6. #- name: example_test
  7. # # Tests with the same group will be grouped in the Buildkite UI
  8. # group: Example group
  9. # # Provide the working directory which will be uploaded to the cluster
  10. # working_dir: example_dir
  11. #
  12. # # How often to run the tests.
  13. # # One of [manual, any, multi, nightly, nightly-3x, weekly].
  14. # # Descriptions of the frequencies that are not immediately obvious:
  15. # # - manual: Not run on a schedule, but can be manually run through the buildkite UI.
  16. # # - nightly-3x: Run 3 times a week (Monday, Wednesday, Friday).
  17. # frequency: weekly
  18. # # Owning team. This field will be persisted to the database
  19. # team: ml
  20. #
  21. # # Python version. This optional field determines which Python version to run tests
  22. # # on. This must be a string!
  23. # python: "3.7"
  24. #
  25. # # Cluster information
  26. # cluster:
  27. # # Location of cluster env, relative to working_dir
  28. # cluster_env: cluster_env.yaml
  29. # # Location of cluster compute, relative to working_dir
  30. # cluster_compute: cluster_compute.yaml
  31. # # Autosuspend parameter passed to the cluster.
  32. # # The cluster will automatically terminate if inactive for this
  33. # # many minutes. Defaults to 10 if not set.
  34. # autosuspend_mins: 10
  35. # # Optional cloud_id to use instead of the default cloud
  36. # cloud_id: cld_12345678
  37. # # Alternatively, you can specify a cloud name
  38. # cloud_name: anyscale_default_cloud
  39. #
  40. # # Run configuration for the test
  41. # run:
  42. # # If you want to wait for nodes to be ready, you can specify this here:
  43. # wait_for_nodes:
  44. # # Number of nodes
  45. # num_nodes: 16
  46. # # Timeout for waiting for nodes. If nodes are not up by then, the
  47. # # test will fail.
  48. # timeout: 600
  49. #
  50. # # Optional prepare script to be run on the cluster before the test script
  51. # prepare: python prepare.py
  52. # # The prepare command can have a separate timeout
  53. # prepare_timeout: 300
  54. #
  55. # # Main script to run as the test script
  56. # script: python workloads/train_small.py
  57. # # Timeout in seconds. After this time the test is considered as failed.
  58. # timeout: 600
  59. #
  60. # # You can specify smoke test definitions here. If a smoke test is triggered,
  61. # # it will deep update the main test configuration with the values provided
  62. # # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
  63. # # environment variable and receive the --smoke-test flag as a parameter in the
  64. # # run script.
  65. # smoke_test:
  66. # # Smoke tests can have different frequencies. A smoke test is only triggered
  67. # # when the regular test is not matched.
  68. # frequency: nightly
  69. # # Here we adjust the run timeout down and run on fewer nodes. The test script
  70. # # remains the same.
  71. # run:
  72. # timeout: 300
  73. # wait_for_nodes:
  74. # num_nodes: 4
  75. # timeout: 600
  76. #
  77. # # After the test has finished, this handler (in alerts/) will process the results.
  78. # # It can then let the test fail, e.g. if a metric regression is observed.
  79. # alert: default
  80. #######################
  81. # Cluster scaling tests
  82. #######################
  83. - name: cluster_tune_scale_up_down
  84. group: Cluster tests
  85. working_dir: cluster_tests
  86. frequency: nightly
  87. team: ml
  88. cluster:
  89. cluster_env: app_config.yaml
  90. cluster_compute: cpt_autoscaling_1-3_aws.yaml
  91. run:
  92. timeout: 3600
  93. script: python workloads/tune_scale_up_down.py
  94. wait_for_nodes:
  95. num_nodes: 0
  96. variations:
  97. - __suffix__: aws
  98. - __suffix__: gce
  99. env: gce
  100. frequency: manual
  101. cluster:
  102. cluster_env: app_config.yaml
  103. cluster_compute: cpt_autoscaling_1-3_gce.yaml
  104. alert: default
  105. #########################
  106. # AIR release tests
  107. #########################
  108. - name: tune_with_frequent_pausing
  109. group: AIR tests
  110. working_dir: air_tests
  111. frequency: nightly-3x
  112. team: ml
  113. cluster:
  114. cluster_env: frequent_pausing/app_config.yaml
  115. cluster_compute: frequent_pausing/compute_config_aws.yaml
  116. run:
  117. timeout: 600 # 10min
  118. long_running: true
  119. script: python frequent_pausing/script.py
  120. variations:
  121. - __suffix__: aws
  122. - __suffix__: gce
  123. env: gce
  124. frequency: manual
  125. cluster:
  126. cluster_env: frequent_pausing/app_config.yaml
  127. cluster_compute: frequent_pausing/compute_config_gce.yaml
  128. alert: default
  129. - name: long_running_horovod_tune_test
  130. group: AIR tests
  131. working_dir: air_tests
  132. frequency: weekly
  133. team: ml
  134. cluster:
  135. cluster_env: horovod/app_config_master.yaml
  136. cluster_compute: horovod/compute_tpl_aws.yaml
  137. variations:
  138. - __suffix__: aws
  139. - __suffix__: gce
  140. env: gce
  141. frequency: manual
  142. cluster:
  143. cluster_env: horovod/app_config_master.yaml
  144. cluster_compute: horovod/compute_tpl_gce.yaml
  145. run:
  146. timeout: 36000
  147. script: python horovod/workloads/horovod_tune_test.py
  148. long_running: true
  149. wait_for_nodes:
  150. num_nodes: 2
  151. smoke_test:
  152. frequency: manual
  153. run:
  154. timeout: 3600
  155. alert: default
  156. - name: air_benchmark_data_bulk_ingest
  157. group: AIR tests
  158. working_dir: air_tests/air_benchmarks
  159. frequency: nightly
  160. team: ml
  161. cluster:
  162. cluster_env: app_config.yaml
  163. cluster_compute: compute_data_20_nodes_aws.yaml
  164. run:
  165. timeout: 3600
  166. script: python workloads/data_benchmark.py --dataset-size-gb=200 --num-workers=20
  167. wait_for_nodes:
  168. num_nodes: 20
  169. variations:
  170. - __suffix__: aws
  171. - __suffix__: gce
  172. env: gce
  173. frequency: manual
  174. cluster:
  175. cluster_env: app_config.yaml
  176. cluster_compute: compute_data_20_nodes_gce.yaml
  177. alert: default
  178. # AIR benchmarks for XGBoost CUJ
  179. - name: air_benchmark_xgboost_cpu_10
  180. group: AIR tests
  181. working_dir: air_tests/air_benchmarks
  182. frequency: nightly
  183. team: ml
  184. cluster:
  185. cluster_env: xgboost_app_config.yaml
  186. cluster_compute: compute_xgboost_aws.yaml
  187. run:
  188. timeout: 36000
  189. script: python workloads/xgboost_benchmark.py
  190. wait_for_nodes:
  191. num_nodes: 11
  192. variations:
  193. - __suffix__: aws
  194. - __suffix__: gce
  195. env: gce
  196. frequency: manual
  197. cluster:
  198. cluster_env: xgboost_app_config.yaml
  199. cluster_compute: compute_xgboost_gce.yaml
  200. smoke_test:
  201. frequency: manual
  202. run:
  203. timeout: 1800
  204. alert: default
  205. # Ray AIR distributed Torch benchmarks
  206. - name: air_benchmark_torch_mnist_cpu_4x1
  207. group: AIR tests
  208. working_dir: air_tests/air_benchmarks
  209. frequency: nightly
  210. team: ml
  211. cluster:
  212. cluster_env: app_config.yaml
  213. cluster_compute: compute_cpu_4_aws.yaml
  214. run:
  215. timeout: 3600
  216. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
  217. wait_for_nodes:
  218. num_nodes: 4
  219. variations:
  220. - __suffix__: aws
  221. - __suffix__: gce
  222. env: gce
  223. frequency: manual
  224. cluster:
  225. cluster_env: app_config.yaml
  226. cluster_compute: compute_cpu_4_gce.yaml
  227. alert: default
  228. - name: air_benchmark_torch_mnist_gpu_4x4
  229. group: AIR tests
  230. working_dir: air_tests/air_benchmarks
  231. frequency: weekly
  232. team: ml
  233. cluster:
  234. cluster_env: app_config.yaml
  235. cluster_compute: compute_gpu_4x4_aws.yaml
  236. run:
  237. timeout: 4800
  238. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 120 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
  239. wait_for_nodes:
  240. num_nodes: 4
  241. smoke_test:
  242. frequency: nightly
  243. cluster:
  244. cluster_compute: compute_gpu_2x2_aws.yaml
  245. run:
  246. timeout: 3600
  247. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
  248. wait_for_nodes:
  249. num_nodes: 2
  250. variations:
  251. - __suffix__: aws
  252. - __suffix__: gce
  253. env: gce
  254. frequency: manual
  255. cluster:
  256. cluster_env: app_config.yaml
  257. cluster_compute: compute_gpu_4x4_gce.yaml
  258. smoke_test:
  259. frequency: manual
  260. alert: default
  261. - name: air_benchmark_torch_mnist_cpu_1x4
  262. group: AIR tests
  263. working_dir: air_tests/air_benchmarks
  264. frequency: nightly
  265. team: ml
  266. cluster:
  267. cluster_env: app_config.yaml
  268. cluster_compute: compute_cpu_1_aws.yaml
  269. run:
  270. timeout: 3600
  271. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
  272. variations:
  273. - __suffix__: aws
  274. - __suffix__: gce
  275. env: gce
  276. frequency: manual
  277. cluster:
  278. cluster_env: app_config.yaml
  279. cluster_compute: compute_cpu_1_gce.yaml
  280. alert: default
  281. - name: air_benchmark_torch_batch_prediction_gpu_1x1_20gb
  282. group: AIR tests
  283. working_dir: air_tests/air_benchmarks
  284. frequency: nightly
  285. team: ml
  286. cluster:
  287. cluster_env: app_config.yaml
  288. cluster_compute: compute_gpu_1_cpu_16_aws.yaml
  289. run:
  290. timeout: 3600
  291. script: python workloads/gpu_batch_prediction.py --data-size-gb 20
  292. alert: default
  293. variations:
  294. - __suffix__: aws
  295. - __suffix__: gce
  296. env: gce
  297. frequency: manual
  298. cluster:
  299. cluster_env: app_config.yaml
  300. cluster_compute: compute_gpu_1_cpu_16_gce.yaml
  301. - name: air_benchmark_torch_batch_prediction_gpu_4x4_100gb
  302. group: AIR tests
  303. working_dir: air_tests/air_benchmarks
  304. frequency: nightly
  305. team: ml
  306. stable: false
  307. cluster:
  308. cluster_env: app_config.yaml
  309. cluster_compute: compute_gpu_4x4_aws.yaml
  310. run:
  311. timeout: 10800
  312. script: python workloads/gpu_batch_prediction.py --data-size-gb 100
  313. wait_for_nodes:
  314. num_nodes: 4
  315. alert: default
  316. variations:
  317. - __suffix__: aws
  318. - __suffix__: gce
  319. env: gce
  320. frequency: manual
  321. cluster:
  322. cluster_env: app_config.yaml
  323. cluster_compute: compute_gpu_4x4_gce.yaml
  324. - name: air_benchmark_torch_mnist_cpu_4x4
  325. group: AIR tests
  326. working_dir: air_tests/air_benchmarks
  327. frequency: nightly
  328. team: ml
  329. cluster:
  330. cluster_env: app_config.yaml
  331. cluster_compute: compute_cpu_4_aws.yaml
  332. run:
  333. timeout: 5400
  334. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
  335. wait_for_nodes:
  336. num_nodes: 4
  337. variations:
  338. - __suffix__: aws
  339. - __suffix__: gce
  340. env: gce
  341. frequency: manual
  342. cluster:
  343. cluster_env: app_config.yaml
  344. cluster_compute: compute_cpu_4_gce.yaml
  345. alert: default
  346. - name: air_benchmark_tune_torch_mnist
  347. group: AIR tests
  348. working_dir: air_tests/air_benchmarks
  349. frequency: nightly
  350. team: ml
  351. cluster:
  352. cluster_env: app_config.yaml
  353. cluster_compute: compute_cpu_8_aws.yaml
  354. run:
  355. timeout: 3600
  356. script: python workloads/tune_torch_benchmark.py --num-runs 3 --num-trials 8 --num-workers 4
  357. wait_for_nodes:
  358. num_nodes: 8
  359. variations:
  360. - __suffix__: aws
  361. - __suffix__: gce
  362. env: gce
  363. frequency: manual
  364. cluster:
  365. cluster_env: app_config.yaml
  366. cluster_compute: compute_cpu_8_gce.yaml
  367. alert: default
  368. - name: air_benchmark_tune_torch_mnist_gpu
  369. group: AIR tests
  370. working_dir: air_tests/air_benchmarks
  371. frequency: nightly
  372. team: ml
  373. cluster:
  374. cluster_env: app_config.yaml
  375. cluster_compute: compute_gpu_4x4_aws.yaml
  376. run:
  377. timeout: 3600
  378. script: python workloads/tune_torch_benchmark.py --num-runs 2 --num-trials 4 --num-workers 4 --use-gpu
  379. wait_for_nodes:
  380. num_nodes: 4
  381. variations:
  382. - __suffix__: aws
  383. - __suffix__: gce
  384. env: gce
  385. frequency: manual
  386. cluster:
  387. cluster_env: app_config.yaml
  388. cluster_compute: compute_gpu_4x4_gce.yaml
  389. alert: default
  390. # Ray AIR distributed Tensorflow benchmarks
  391. - name: air_benchmark_tensorflow_mnist_cpu_4x1
  392. group: AIR tests
  393. working_dir: air_tests/air_benchmarks
  394. frequency: nightly
  395. team: ml
  396. cluster:
  397. cluster_env: app_config.yaml
  398. cluster_compute: compute_cpu_4_aws.yaml
  399. run:
  400. timeout: 5400
  401. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
  402. wait_for_nodes:
  403. num_nodes: 4
  404. variations:
  405. - __suffix__: aws
  406. - __suffix__: gce
  407. env: gce
  408. frequency: manual
  409. cluster:
  410. cluster_env: app_config.yaml
  411. cluster_compute: compute_cpu_4_gce.yaml
  412. alert: default
  413. - name: air_benchmark_tensorflow_mnist_cpu_1x4
  414. group: AIR tests
  415. working_dir: air_tests/air_benchmarks
  416. frequency: nightly
  417. team: ml
  418. cluster:
  419. cluster_env: app_config.yaml
  420. cluster_compute: compute_cpu_1_aws.yaml
  421. run:
  422. timeout: 5400
  423. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
  424. variations:
  425. - __suffix__: aws
  426. - __suffix__: gce
  427. env: gce
  428. frequency: manual
  429. cluster:
  430. cluster_env: app_config.yaml
  431. cluster_compute: compute_cpu_1_gce.yaml
  432. alert: default
  433. - name: air_benchmark_tensorflow_mnist_cpu_4x4
  434. group: AIR tests
  435. working_dir: air_tests/air_benchmarks
  436. frequency: nightly
  437. team: ml
  438. stable: false
  439. cluster:
  440. cluster_env: app_config.yaml
  441. cluster_compute: compute_cpu_4_aws.yaml
  442. run:
  443. timeout: 5400
  444. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
  445. wait_for_nodes:
  446. num_nodes: 4
  447. variations:
  448. - __suffix__: aws
  449. - __suffix__: gce
  450. env: gce
  451. frequency: manual
  452. cluster:
  453. cluster_env: app_config.yaml
  454. cluster_compute: compute_cpu_4_gce.yaml
  455. alert: default
  456. - name: air_benchmark_tensorflow_mnist_gpu_4x4
  457. group: AIR tests
  458. working_dir: air_tests/air_benchmarks
  459. frequency: weekly
  460. team: ml
  461. stable: false
  462. cluster:
  463. cluster_env: app_config.yaml
  464. cluster_compute: compute_gpu_4x4_aws.yaml
  465. run:
  466. timeout: 5400
  467. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
  468. wait_for_nodes:
  469. num_nodes: 4
  470. smoke_test:
  471. frequency: nightly
  472. cluster:
  473. cluster_compute: compute_gpu_2x2_aws.yaml
  474. run:
  475. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
  476. wait_for_nodes:
  477. num_nodes: 2
  478. variations:
  479. - __suffix__: aws
  480. - __suffix__: gce
  481. env: gce
  482. frequency: manual
  483. cluster:
  484. cluster_env: app_config.yaml
  485. cluster_compute: compute_gpu_4x4_gce.yaml
  486. smoke_test:
  487. frequency: manual
  488. alert: default
  489. - name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb
  490. group: AIR tests
  491. working_dir: air_tests/air_benchmarks
  492. frequency: nightly
  493. team: ml
  494. cluster:
  495. cluster_env: app_config.yaml
  496. cluster_compute: compute_gpu_1_aws.yaml
  497. run:
  498. timeout: 3600
  499. script: python workloads/pytorch_training_e2e.py --data-size-gb 20
  500. alert: default
  501. variations:
  502. - __suffix__: aws
  503. - __suffix__: gce
  504. env: gce
  505. frequency: manual
  506. cluster:
  507. cluster_env: app_config.yaml
  508. cluster_compute: compute_gpu_1_gce.yaml
  509. - name: air_benchmark_pytorch_training_e2e_gpu_4x4_100gb
  510. group: AIR tests
  511. working_dir: air_tests/air_benchmarks
  512. frequency: nightly
  513. team: ml
  514. stable: false
  515. cluster:
  516. cluster_env: app_config.yaml
  517. cluster_compute: compute_gpu_4x4_aws.yaml
  518. run:
  519. timeout: 10800
  520. script: python workloads/pytorch_training_e2e.py --data-size-gb=100 --num-workers=16
  521. wait_for_nodes:
  522. num_nodes: 4
  523. alert: default
  524. variations:
  525. - __suffix__: aws
  526. - __suffix__: gce
  527. env: gce
  528. frequency: manual
  529. cluster:
  530. cluster_env: app_config.yaml
  531. cluster_compute: compute_gpu_4x4_gce.yaml
  532. # Test tiny, medium, and huge input files.
  533. - name: ray-data-bulk-ingest-file-size-benchmark
  534. group: AIR tests
  535. working_dir: air_tests/air_benchmarks/mlperf-train
  536. stable: false
  537. jailed: true
  538. frequency: nightly
  539. team: data
  540. cluster:
  541. cluster_env: app_config_oom.yaml
  542. cluster_compute: compute_cpu_16.yaml
  543. run:
  544. timeout: 3600
  545. script: bash file_size_benchmark.sh
  546. variations:
  547. - __suffix__: aws
  548. - __suffix__: gce
  549. env: gce
  550. frequency: manual
  551. cluster:
  552. cluster_env: app_config_oom.yaml
  553. cluster_compute: compute_gce_cpu_16.yaml
  554. # Test dataset larger than object store memory.
  555. - name: ray-data-bulk-ingest-out-of-core-benchmark
  556. group: AIR tests
  557. working_dir: air_tests/air_benchmarks/mlperf-train
  558. stable: false
  559. jailed: true
  560. frequency: nightly
  561. team: data
  562. cluster:
  563. cluster_env: app_config_oom.yaml
  564. cluster_compute: compute_cpu_16.yaml
  565. run:
  566. timeout: 3600
  567. script: bash out_of_core_benchmark.sh
  568. variations:
  569. - __suffix__: aws
  570. - __suffix__: gce
  571. env: gce
  572. frequency: manual
  573. cluster:
  574. cluster_env: app_config_oom.yaml
  575. cluster_compute: compute_gce_cpu_16.yaml
  576. # Test additional CPU nodes for preprocessing.
  577. - name: ray-data-bulk-ingest-heterogeneity-benchmark
  578. group: AIR tests
  579. working_dir: air_tests/air_benchmarks/mlperf-train
  580. stable: false
  581. jailed: true
  582. frequency: nightly
  583. team: data
  584. cluster:
  585. cluster_env: app_config_oom.yaml
  586. cluster_compute: compute_cpu_16_worker_nodes_2.yaml
  587. run:
  588. wait_for_nodes:
  589. num_nodes: 3
  590. timeout: 1800
  591. script: bash heterogeneity_benchmark.sh 2
  592. variations:
  593. - __suffix__: aws
  594. - __suffix__: gce
  595. env: gce
  596. frequency: manual
  597. cluster:
  598. cluster_env: app_config_oom.yaml
  599. cluster_compute: compute_gce_cpu_16_worker_nodes_2.yaml
  600. #######################
  601. # AIR examples
  602. #######################
  603. # DreamBooth fine-tuning example (runs dreambooth_run.sh).
  604. - name: air_example_dreambooth_finetuning
  605. group: AIR examples
  606. working_dir: air_examples/dreambooth
  607. stable: false
  608. frequency: weekly
  609. team: ml
  610. cluster:
  611. cluster_env: dreambooth_env.yaml
  612. cluster_compute: dreambooth_compute_aws.yaml
  613. run:
  614. timeout: 1800
  615. script: bash dreambooth_run.sh
  616. artifact_path: /tmp/artifacts/example_out.jpg
  617. # variations: A10G is not yet available on GCE.
  618. - name: air_example_gptj_deepspeed_fine_tuning
  619. group: AIR examples
  620. working_dir: air_examples/gptj_deepspeed_finetuning
  621. python: "3.9"
  622. frequency: weekly
  623. team: ml
  624. cluster:
  625. cluster_env: gptj_deepspeed_env.yaml
  626. cluster_compute: gptj_deepspeed_compute_aws.yaml
  627. run:
  628. timeout: 3600
  629. script: python test_myst_doc.py --path gptj_deepspeed_fine_tuning.ipynb
  630. variations:
  631. - __suffix__: aws
  632. - __suffix__: gce
  633. env: gce
  634. frequency: manual
  635. cluster:
  636. cluster_env: gptj_deepspeed_env.yaml
  637. cluster_compute: gptj_deepspeed_compute_gce.yaml
  638. - name: air_example_dolly_v2_lightning_fsdp_finetuning
  639. group: AIR examples
  640. working_dir: air_examples/dolly_v2_lightning_fsdp_finetuning
  641. python: "3.8"
  642. frequency: weekly
  643. team: ml
  644. cluster:
  645. cluster_env: dolly_v2_fsdp_env.yaml
  646. cluster_compute: dolly_v2_fsdp_compute_aws.yaml
  647. run:
  648. timeout: 4700
  649. script: python test_myst_doc.py --path lightning-llm-finetuning-7b.ipynb
  650. - name: air_example_opt_deepspeed_batch_inference
  651. group: AIR examples
  652. working_dir: air_examples/opt_deepspeed_batch_inference
  653. python: "3.9"
  654. frequency: weekly
  655. team: ml
  656. cluster:
  657. cluster_env: 30b_deepspeed_env.yaml
  658. cluster_compute: 30b_deepspeed_compute.yaml
  659. run:
  660. timeout: 3600
  661. script: python test_myst_doc.py --path opt_deepspeed_batch_inference.ipynb
  662. # variations: TODO(jungong): add GCP variation.
  663. #####################################
  664. # Workspace templates release tests #
  665. #####################################
  666. - name: workspace_template_batch_inference
  667. group: Workspace templates
  668. working_dir: workspace_templates/01_batch_inference
  669. python: "3.9"
  670. frequency: nightly-3x
  671. team: ml
  672. cluster:
  673. cluster_env: ../testing/cluster_envs/default_cluster_env_latest_ml_py39.yaml
  674. cluster_compute: ../testing/compute_configs/gpu/aws.yaml
  675. run:
  676. timeout: 600
  677. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py
  678. variations:
  679. - __suffix__: aws
  680. - __suffix__: gce
  681. env: gce
  682. frequency: manual
  683. cluster:
  684. cluster_env: ../testing/cluster_envs/default_cluster_env_latest_ml_py39.yaml
  685. cluster_compute: ../testing/compute_configs/gpu/gce.yaml
  686. - name: workspace_template_many_model_training
  687. group: Workspace templates
  688. working_dir: workspace_templates/02_many_model_training
  689. python: "3.9"
  690. frequency: nightly-3x
  691. team: ml
  692. cluster:
  693. cluster_env: ../testing/cluster_envs/default_cluster_env_latest_ml_py39.yaml
  694. cluster_compute: ../testing/compute_configs/cpu/aws.yaml
  695. run:
  696. timeout: 600
  697. script: pip install --user -r requirements.txt && jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py
  698. variations:
  699. - __suffix__: aws
  700. - __suffix__: gce
  701. env: gce
  702. frequency: manual
  703. cluster:
  704. cluster_env: ../testing/cluster_envs/default_cluster_env_latest_ml_py39.yaml
  705. cluster_compute: ../testing/compute_configs/cpu/gce.yaml
  706. - name: workspace_template_serving_stable_diffusion
  707. group: Workspace templates
  708. working_dir: workspace_templates/03_serving_stable_diffusion
  709. python: "3.9"
  710. frequency: nightly-3x
  711. team: ml
  712. cluster:
  713. cluster_env: ../testing/cluster_envs/03_serving_stable_diffusion.yaml
  714. cluster_compute: ../testing/compute_configs/gpu/aws.yaml
  715. run:
  716. timeout: 600
  717. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py
  718. variations:
  719. - __suffix__: aws
  720. - __suffix__: gce
  721. env: gce
  722. frequency: manual
  723. cluster:
  724. cluster_env: ../testing/cluster_envs/03_serving_stable_diffusion.yaml
  725. cluster_compute: ../testing/compute_configs/gpu/gce.yaml
  726. #######################
  727. # XGBoost release tests
  728. #######################
  729. # It seems like the consensus is that we can deprecate this test.
  730. # - name: xgboost_train_small
  731. # group: XGBoost
  732. # working_dir: xgboost_tests
  733. # frequency: nightly
  734. # team: ml
  735. # env: staging_v2
  736. # cluster:
  737. # cluster_env: app_config.yaml
  738. # cluster_compute: tpl_cpu_small.yaml
  739. # run:
  740. # timeout: 600
  741. # script: python workloads/train_small.py
  742. # wait_for_nodes:
  743. # num_nodes: 4
  744. # alert: xgboost_tests
  745. - name: xgboost_train_moderate
  746. group: XGBoost
  747. working_dir: xgboost_tests
  748. frequency: nightly
  749. team: ml
  750. cluster:
  751. cluster_env: app_config.yaml
  752. cluster_compute: tpl_cpu_moderate_aws.yaml
  753. run:
  754. timeout: 600
  755. script: python workloads/train_moderate.py
  756. wait_for_nodes:
  757. num_nodes: 32
  758. variations:
  759. - __suffix__: aws
  760. - __suffix__: gce
  761. env: gce
  762. frequency: manual
  763. cluster:
  764. cluster_env: app_config.yaml
  765. cluster_compute: tpl_cpu_moderate_gce.yaml
  766. alert: xgboost_tests
  767. - name: xgboost_train_gpu
  768. group: XGBoost
  769. working_dir: xgboost_tests
  770. frequency: nightly
  771. team: ml
  772. cluster:
  773. cluster_env: app_config_gpu.yaml
  774. cluster_compute: tpl_gpu_small_aws.yaml
  775. run:
  776. timeout: 600
  777. script: python workloads/train_gpu.py
  778. wait_for_nodes:
  779. num_nodes: 5
  780. variations:
  781. - __suffix__: aws
  782. - __suffix__: gce
  783. env: gce
  784. frequency: manual
  785. cluster:
  786. cluster_env: app_config_gpu.yaml
  787. cluster_compute: tpl_gpu_small_gce.yaml
  788. alert: xgboost_tests
  789. - name: xgboost_distributed_api_test
  790. group: XGBoost
  791. working_dir: xgboost_tests
  792. frequency: nightly
  793. team: ml
  794. cluster:
  795. cluster_env: app_config.yaml
  796. cluster_compute: tpl_cpu_small_aws.yaml
  797. run:
  798. timeout: 600
  799. script: python workloads/distributed_api_test.py
  800. wait_for_nodes:
  801. num_nodes: 4
  802. variations:
  803. - __suffix__: aws
  804. - __suffix__: gce
  805. env: gce
  806. frequency: manual
  807. cluster:
  808. cluster_env: app_config.yaml
  809. cluster_compute: tpl_cpu_small_gce.yaml
  810. alert: default
  811. - name: xgboost_ft_small_elastic
  812. group: XGBoost
  813. working_dir: xgboost_tests
  814. frequency: nightly
  815. team: ml
  816. cluster:
  817. cluster_env: app_config.yaml
  818. cluster_compute: tpl_cpu_small_aws.yaml
  819. run:
  820. timeout: 900
  821. script: python workloads/ft_small_elastic.py
  822. wait_for_nodes:
  823. num_nodes: 4
  824. variations:
  825. - __suffix__: aws
  826. - __suffix__: gce
  827. env: gce
  828. frequency: manual
  829. cluster:
  830. cluster_env: app_config.yaml
  831. cluster_compute: tpl_cpu_small_gce.yaml
  832. alert: default
  833. - name: xgboost_ft_small_non_elastic
  834. group: XGBoost
  835. working_dir: xgboost_tests
  836. frequency: nightly
  837. team: ml
  838. cluster:
  839. cluster_env: app_config.yaml
  840. cluster_compute: tpl_cpu_small_aws.yaml
  841. run:
  842. timeout: 900
  843. script: python workloads/ft_small_non_elastic.py
  844. wait_for_nodes:
  845. num_nodes: 4
  846. variations:
  847. - __suffix__: aws
  848. - __suffix__: gce
  849. env: gce
  850. frequency: manual
  851. cluster:
  852. cluster_env: app_config.yaml
  853. cluster_compute: tpl_cpu_small_gce.yaml
  854. alert: default
  855. - name: xgboost_tune_small
  856. group: XGBoost
  857. working_dir: xgboost_tests
  858. frequency: nightly
  859. team: ml
  860. cluster:
  861. cluster_env: app_config.yaml
  862. cluster_compute: tpl_cpu_small_aws.yaml
  863. run:
  864. timeout: 600
  865. script: python workloads/tune_small.py
  866. wait_for_nodes:
  867. num_nodes: 4
  868. variations:
  869. - __suffix__: aws
  870. - __suffix__: gce
  871. env: gce
  872. frequency: manual
  873. cluster:
  874. cluster_env: app_config.yaml
  875. cluster_compute: tpl_cpu_small_gce.yaml
  876. alert: xgboost_tests
  877. - name: xgboost_tune_32x4
  878. group: XGBoost
  879. working_dir: xgboost_tests
  880. frequency: nightly
  881. team: ml
  882. cluster:
  883. cluster_env: app_config.yaml
  884. cluster_compute: tpl_cpu_moderate_aws.yaml
  885. run:
  886. timeout: 900
  887. script: python workloads/tune_32x4.py
  888. wait_for_nodes:
  889. num_nodes: 32
  890. variations:
  891. - __suffix__: aws
  892. - __suffix__: gce
  893. env: gce
  894. frequency: manual
  895. cluster:
  896. cluster_env: app_config.yaml
  897. cluster_compute: tpl_cpu_moderate_gce.yaml
  898. alert: xgboost_tests
  899. - name: xgboost_tune_4x32
  900. group: XGBoost
  901. working_dir: xgboost_tests
  902. frequency: nightly
  903. team: ml
  904. cluster:
  905. cluster_env: app_config.yaml
  906. cluster_compute: tpl_cpu_moderate_aws.yaml
  907. run:
  908. timeout: 900
  909. script: python workloads/tune_4x32.py
  910. wait_for_nodes:
  911. num_nodes: 32
  912. variations:
  913. - __suffix__: aws
  914. - __suffix__: gce
  915. env: gce
  916. frequency: manual
  917. cluster:
  918. cluster_env: app_config.yaml
  919. cluster_compute: tpl_cpu_moderate_gce.yaml
  920. alert: xgboost_tests
  921. #######################
  922. # LightGBM tests
  923. #######################
  924. # It seems like the consensus is that we can deprecate this test.
  925. # - name: lightgbm_train_small
  926. # group: LightGBM tests
  927. # working_dir: lightgbm_tests
  928. # frequency: nightly
  929. # team: ml
  930. # env: staging_v2
  931. # cluster:
  932. # cluster_env: app_config.yaml
  933. # cluster_compute: tpl_cpu_small.yaml
  934. # run:
  935. # timeout: 600
  936. # script: python workloads/train_small.py
  937. # wait_for_nodes:
  938. # num_nodes: 4
  939. # alert: default
  940. - name: lightgbm_train_moderate
  941. group: LightGBM tests
  942. working_dir: lightgbm_tests
  943. frequency: nightly
  944. team: ml
  945. cluster:
  946. cluster_env: app_config.yaml
  947. cluster_compute: tpl_cpu_moderate_aws.yaml
  948. run:
  949. timeout: 600
  950. script: python workloads/train_moderate.py
  951. wait_for_nodes:
  952. num_nodes: 32
  953. variations:
  954. - __suffix__: aws
  955. - __suffix__: gce
  956. env: gce
  957. frequency: manual
  958. cluster:
  959. cluster_env: app_config.yaml
  960. cluster_compute: tpl_cpu_moderate_gce.yaml
  961. alert: default
  962. - name: lightgbm_distributed_api_test
  963. group: LightGBM tests
  964. working_dir: lightgbm_tests
  965. frequency: nightly
  966. team: ml
  967. cluster:
  968. cluster_env: app_config.yaml
  969. cluster_compute: tpl_cpu_small_aws.yaml
  970. run:
  971. timeout: 600
  972. script: python workloads/distributed_api_test.py
  973. wait_for_nodes:
  974. num_nodes: 4
  975. variations:
  976. - __suffix__: aws
  977. - __suffix__: gce
  978. env: gce
  979. frequency: manual
  980. cluster:
  981. cluster_env: app_config.yaml
  982. cluster_compute: tpl_cpu_small_gce.yaml
  983. alert: default
  984. - name: lightgbm_ft_small_non_elastic
  985. group: LightGBM tests
  986. working_dir: lightgbm_tests
  987. frequency: nightly
  988. team: ml
  989. cluster:
  990. cluster_env: app_config.yaml
  991. cluster_compute: tpl_cpu_small_aws.yaml
  992. run:
  993. timeout: 900
  994. script: python workloads/ft_small_non_elastic.py
  995. wait_for_nodes:
  996. num_nodes: 4
  997. variations:
  998. - __suffix__: aws
  999. - __suffix__: gce
  1000. env: gce
  1001. frequency: manual
  1002. cluster:
  1003. cluster_env: app_config.yaml
  1004. cluster_compute: tpl_cpu_small_gce.yaml
  1005. alert: default
  1006. - name: lightgbm_tune_small
  1007. group: LightGBM tests
  1008. working_dir: lightgbm_tests
  1009. frequency: nightly
  1010. team: ml
  1011. cluster:
  1012. cluster_env: app_config.yaml
  1013. cluster_compute: tpl_cpu_small_aws.yaml
  1014. run:
  1015. timeout: 600
  1016. script: python workloads/tune_small.py
  1017. wait_for_nodes:
  1018. num_nodes: 4
  1019. variations:
  1020. - __suffix__: aws
  1021. - __suffix__: gce
  1022. env: gce
  1023. frequency: manual
  1024. cluster:
  1025. cluster_env: app_config.yaml
  1026. cluster_compute: tpl_cpu_small_gce.yaml
  1027. alert: default
  1028. - name: lightgbm_tune_16x4
  1029. group: LightGBM tests
  1030. working_dir: lightgbm_tests
  1031. frequency: nightly
  1032. team: ml
  1033. cluster:
  1034. cluster_env: app_config.yaml
  1035. cluster_compute: tpl_cpu_moderate_aws.yaml
  1036. run:
  1037. timeout: 900
  1038. script: python workloads/tune_16x4.py
  1039. wait_for_nodes:
  1040. num_nodes: 32
  1041. variations:
  1042. - __suffix__: aws
  1043. - __suffix__: gce
  1044. env: gce
  1045. frequency: manual
  1046. cluster:
  1047. cluster_env: app_config.yaml
  1048. cluster_compute: tpl_cpu_moderate_gce.yaml
  1049. alert: default
  1050. - name: lightgbm_tune_4x16
  1051. group: LightGBM tests
  1052. working_dir: lightgbm_tests
  1053. frequency: nightly
  1054. team: ml
  1055. cluster:
  1056. cluster_env: app_config.yaml
  1057. cluster_compute: tpl_cpu_moderate_aws.yaml
  1058. run:
  1059. timeout: 900
  1060. script: python workloads/tune_4x16.py
  1061. wait_for_nodes:
  1062. num_nodes: 32
  1063. variations:
  1064. - __suffix__: aws
  1065. - __suffix__: gce
  1066. env: gce
  1067. frequency: manual
  1068. cluster:
  1069. cluster_env: app_config.yaml
  1070. cluster_compute: tpl_cpu_moderate_gce.yaml
  1071. alert: default
  1072. #######################
  1073. # Lightning tests
  1074. #######################
  1075. # Naming convention: lightning_{accelerator}_{mode}_{#cpu}_{#gpu}
  1076. - name: lightning_gpu_train_3x16_3x1
  1077. group: Lightning tests
  1078. working_dir: lightning_tests
  1079. frequency: nightly-3x
  1080. team: ml
  1081. cluster:
  1082. cluster_env: app_config.yaml
  1083. cluster_compute: compute_tpl_aws.yaml
  1084. run:
  1085. timeout: 1200
  1086. script: python workloads/test_trainer.py
  1087. wait_for_nodes:
  1088. num_nodes: 3
  1089. variations:
  1090. - __suffix__: aws
  1091. - __suffix__: gce
  1092. env: gce
  1093. frequency: manual
  1094. cluster:
  1095. cluster_env: app_config.yaml
  1096. cluster_compute: compute_tpl_gce.yaml
  1097. alert: default
  1098. - name: lightning_gpu_tune_3x16_3x1
  1099. group: Lightning tests
  1100. working_dir: lightning_tests
  1101. frequency: nightly-3x
  1102. team: ml
  1103. cluster:
  1104. cluster_env: app_config.yaml
  1105. cluster_compute: compute_tpl_aws.yaml
  1106. run:
  1107. timeout: 1200
  1108. script: python workloads/test_tuner.py
  1109. wait_for_nodes:
  1110. num_nodes: 3
  1111. variations:
  1112. - __suffix__: aws
  1113. - __suffix__: gce
  1114. env: gce
  1115. frequency: manual
  1116. cluster:
  1117. cluster_env: app_config.yaml
  1118. cluster_compute: compute_tpl_gce.yaml
  1119. alert: default
  1120. #######################
  1121. # ML user tests
  1122. #######################
  1123. - name: ml_user_horovod_user_test_latest
  1124. group: ML user tests
  1125. working_dir: ml_user_tests
  1126. frequency: nightly-3x
  1127. team: ml
  1128. cluster:
  1129. cluster_env: horovod/app_config.yaml
  1130. cluster_compute: horovod/compute_tpl_aws.yaml
  1131. run:
  1132. timeout: 1200
  1133. script: python horovod/horovod_user_test.py
  1134. wait_for_nodes:
  1135. num_nodes: 4
  1136. variations:
  1137. - __suffix__: aws
  1138. - __suffix__: gce
  1139. env: gce
  1140. frequency: manual
  1141. cluster:
  1142. cluster_env: horovod/app_config.yaml
  1143. cluster_compute: horovod/compute_tpl_gce.yaml
  1144. alert: default
  1145. - name: ml_user_horovod_user_test_master
  1146. group: ML user tests
  1147. working_dir: ml_user_tests
  1148. frequency: nightly-3x
  1149. team: ml
  1150. cluster:
  1151. cluster_env: horovod/app_config_master.yaml
  1152. cluster_compute: horovod/compute_tpl_aws.yaml
  1153. run:
  1154. timeout: 1200
  1155. script: python horovod/horovod_user_test.py
  1156. wait_for_nodes:
  1157. num_nodes: 4
  1158. variations:
  1159. - __suffix__: aws
  1160. - __suffix__: gce
  1161. env: gce
  1162. frequency: manual
  1163. cluster:
  1164. cluster_env: horovod/app_config_master.yaml
  1165. cluster_compute: horovod/compute_tpl_gce.yaml
  1166. alert: default
  1167. - name: ml_user_train_tensorflow_mnist_test
  1168. group: ML user tests
  1169. working_dir: ml_user_tests
  1170. frequency: nightly-3x
  1171. team: ml
  1172. cluster:
  1173. cluster_env: train/app_config.yaml
  1174. cluster_compute: train/compute_tpl_aws.yaml
  1175. run:
  1176. timeout: 36000
  1177. script: python train/train_tensorflow_mnist_test.py
  1178. wait_for_nodes:
  1179. num_nodes: 3
  1180. variations:
  1181. - __suffix__: aws
  1182. - __suffix__: gce
  1183. env: gce
  1184. frequency: manual
  1185. cluster:
  1186. cluster_env: train/app_config.yaml
  1187. cluster_compute: train/compute_tpl_gce.yaml
  1188. alert: default
  1189. - name: ml_user_train_torch_linear_test
  1190. group: ML user tests
  1191. working_dir: ml_user_tests
  1192. frequency: nightly-3x
  1193. team: ml
  1194. cluster:
  1195. cluster_env: train/app_config.yaml
  1196. cluster_compute: train/compute_tpl_aws.yaml
  1197. run:
  1198. timeout: 36000
  1199. script: python train/train_torch_linear_test.py
  1200. wait_for_nodes:
  1201. num_nodes: 3
  1202. variations:
  1203. - __suffix__: aws
  1204. - __suffix__: gce
  1205. env: gce
  1206. frequency: manual
  1207. cluster:
  1208. cluster_env: train/app_config.yaml
  1209. cluster_compute: train/compute_tpl_gce.yaml
  1210. alert: default
  1211. - name: ml_user_xgboost_gpu_connect_latest
  1212. group: ML user tests
  1213. working_dir: ml_user_tests
  1214. frequency: nightly-3x
  1215. team: ml
  1216. cluster:
  1217. cluster_env: xgboost/app_config_gpu.yaml
  1218. cluster_compute: xgboost/tpl_gpu_small_scaling_aws.yaml
  1219. run:
  1220. timeout: 1200
  1221. script: python xgboost/train_gpu_connect.py
  1222. wait_for_nodes:
  1223. num_nodes: 5
  1224. variations:
  1225. - __suffix__: aws
  1226. - __suffix__: gce
  1227. env: gce
  1228. frequency: manual
  1229. cluster:
  1230. cluster_env: xgboost/app_config_gpu.yaml
  1231. cluster_compute: xgboost/tpl_gpu_small_scaling_gce.yaml
  1232. alert: default
  1233. - name: ml_user_xgboost_gpu_connect_master
  1234. group: ML user tests
  1235. working_dir: ml_user_tests
  1236. frequency: nightly-3x
  1237. team: ml
  1238. cluster:
  1239. cluster_env: xgboost/app_config_gpu_master.yaml
  1240. cluster_compute: xgboost/tpl_gpu_small_scaling_aws.yaml
  1241. run:
  1242. timeout: 1200
  1243. script: python xgboost/train_gpu_connect.py
  1244. wait_for_nodes:
  1245. num_nodes: 5
  1246. variations:
  1247. - __suffix__: aws
  1248. - __suffix__: gce
  1249. env: gce
  1250. frequency: manual
  1251. cluster:
  1252. cluster_env: xgboost/app_config_gpu_master.yaml
  1253. cluster_compute: xgboost/tpl_gpu_small_scaling_gce.yaml
  1254. alert: default
  1255. - name: ml_user_ray_lightning_user_test_latest
  1256. group: ML user tests
  1257. working_dir: ml_user_tests
  1258. frequency: nightly-3x
  1259. team: ml
  1260. cluster:
  1261. cluster_env: ray-lightning/app_config.yaml
  1262. cluster_compute: ray-lightning/compute_tpl_aws.yaml
  1263. run:
  1264. timeout: 1200
  1265. script: python ray-lightning/ray_lightning_user_test.py
  1266. wait_for_nodes:
  1267. num_nodes: 3
  1268. variations:
  1269. - __suffix__: aws
  1270. - __suffix__: gce
  1271. env: gce
  1272. frequency: manual
  1273. cluster:
  1274. cluster_env: ray-lightning/app_config.yaml
  1275. cluster_compute: ray-lightning/compute_tpl_gce.yaml
  1276. alert: default
  1277. - name: ml_user_ray_lightning_user_test_master
  1278. group: ML user tests
  1279. working_dir: ml_user_tests
  1280. frequency: nightly-3x
  1281. team: ml
  1282. cluster:
  1283. cluster_env: ray-lightning/app_config_master.yaml
  1284. cluster_compute: ray-lightning/compute_tpl_aws.yaml
  1285. run:
  1286. timeout: 1200
  1287. script: python ray-lightning/ray_lightning_user_test.py
  1288. wait_for_nodes:
  1289. num_nodes: 3
  1290. variations:
  1291. - __suffix__: aws
  1292. - __suffix__: gce
  1293. env: gce
  1294. frequency: manual
  1295. cluster:
  1296. cluster_env: ray-lightning/app_config_master.yaml
  1297. cluster_compute: ray-lightning/compute_tpl_gce.yaml
  1298. alert: default
  1299. - name: ml_user_tune_rllib_connect_test
  1300. group: ML user tests
  1301. working_dir: ml_user_tests
  1302. frequency: nightly-3x
  1303. team: ml
  1304. cluster:
  1305. cluster_env: ../rllib_tests/app_config.yaml
  1306. cluster_compute: tune_rllib/compute_tpl_aws.yaml
  1307. run:
  1308. timeout: 2000
  1309. script: python tune_rllib/run_connect_tests.py
  1310. wait_for_nodes:
  1311. num_nodes: 9
  1312. variations:
  1313. - __suffix__: aws
  1314. - __suffix__: gce
  1315. env: gce
  1316. frequency: manual
  1317. cluster:
  1318. cluster_env: ../rllib_tests/app_config.yaml
  1319. cluster_compute: tune_rllib/compute_tpl_gce.yaml
  1320. alert: default
  1321. #######################
  1322. # Tune cloud tests
  1323. #######################
  1324. - name: tune_cloud_no_sync_down
  1325. group: Tune cloud tests
  1326. working_dir: tune_tests/cloud_tests
  1327. frequency: nightly
  1328. team: ml
  1329. cluster:
  1330. cluster_env: app_config.yaml
  1331. cluster_compute: tpl_aws_4x2.yaml
  1332. run:
  1333. timeout: 600
  1334. script: python workloads/run_cloud_test.py no_sync_down
  1335. wait_for_nodes:
  1336. num_nodes: 4
  1337. variations:
  1338. - __suffix__: aws
  1339. - __suffix__: gce
  1340. env: gce
  1341. frequency: manual
  1342. cluster:
  1343. cluster_compute: tpl_gce_4x8.yaml
  1344. alert: tune_tests
  1345. - name: tune_cloud_ssh_sync
  1346. group: Tune cloud tests
  1347. working_dir: tune_tests/cloud_tests
  1348. frequency: nightly
  1349. team: ml
  1350. cluster:
  1351. cluster_env: app_config.yaml
  1352. cluster_compute: tpl_aws_4x2.yaml
  1353. run:
  1354. timeout: 600
  1355. script: python workloads/run_cloud_test.py ssh_sync
  1356. wait_for_nodes:
  1357. num_nodes: 4
  1358. variations:
  1359. - __suffix__: aws
  1360. - __suffix__: gce
  1361. env: gce
  1362. frequency: manual
  1363. cluster:
  1364. cluster_env: app_config.yaml
  1365. cluster_compute: tpl_gce_4x8.yaml
  1366. alert: tune_tests
  1367. - name: tune_cloud_durable_upload
  1368. group: Tune cloud tests
  1369. working_dir: tune_tests/cloud_tests
  1370. frequency: nightly
  1371. team: ml
  1372. cluster:
  1373. cluster_env: app_config.yaml
  1374. cluster_compute: tpl_aws_4x2.yaml
  1375. run:
  1376. timeout: 600
  1377. script: python workloads/run_cloud_test.py durable_upload --bucket s3://tune-cloud-tests/durable_upload
  1378. wait_for_nodes:
  1379. num_nodes: 4
  1380. variations:
  1381. - __suffix__: aws
  1382. - __suffix__: gce
  1383. env: gce
  1384. frequency: manual
  1385. cluster:
  1386. cluster_env: app_config.yaml
  1387. cluster_compute: tpl_gce_4x8.yaml
  1388. run:
  1389. timeout: 600
  1390. script: python workloads/run_cloud_test.py durable_upload --bucket gs://tune-cloud-tests/durable_upload
  1391. wait_for_nodes:
  1392. num_nodes: 4
  1393. alert: tune_tests
  1394. - name: tune_cloud_durable_upload_rllib_str
  1395. group: Tune cloud tests
  1396. working_dir: tune_tests/cloud_tests
  1397. stable: false
  1398. frequency: nightly
  1399. team: ml
  1400. cluster:
  1401. cluster_env: app_config_ml.yaml
  1402. cluster_compute: tpl_aws_4x2.yaml
  1403. run:
  1404. timeout: 600
  1405. script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str
  1406. --bucket s3://tune-cloud-tests/durable_upload_rllib_str
  1407. wait_for_nodes:
  1408. num_nodes: 4
  1409. variations:
  1410. - __suffix__: aws
  1411. - __suffix__: gce
  1412. env: gce
  1413. frequency: manual
  1414. cluster:
  1415. cluster_env: app_config_ml.yaml
  1416. cluster_compute: tpl_gce_4x2.yaml
  1417. run:
  1418. timeout: 600
  1419. script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str
  1420. --bucket gs://tune-cloud-tests/durable_upload_rllib_str
  1421. wait_for_nodes:
  1422. num_nodes: 4
  1423. alert: tune_tests
  1424. - name: tune_cloud_durable_upload_rllib_trainer
  1425. group: Tune cloud tests
  1426. working_dir: tune_tests/cloud_tests
  1427. stable: false
  1428. frequency: nightly
  1429. team: ml
  1430. cluster:
  1431. cluster_env: app_config_ml.yaml
  1432. cluster_compute: tpl_aws_4x2.yaml
  1433. run:
  1434. timeout: 600
  1435. script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer
  1436. --bucket s3://tune-cloud-tests/durable_upload_rllib_trainer
  1437. wait_for_nodes:
  1438. num_nodes: 4
  1439. variations:
  1440. - __suffix__: aws
  1441. - __suffix__: gce
  1442. env: gce
  1443. frequency: manual
  1444. cluster:
  1445. cluster_env: app_config_ml.yaml
  1446. cluster_compute: tpl_gce_4x2.yaml
  1447. run:
  1448. timeout: 600
  1449. script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str
  1450. --bucket gs://tune-cloud-tests/durable_upload_rllib_trainer
  1451. wait_for_nodes:
  1452. num_nodes: 4
  1453. alert: tune_tests
  1454. ########################
  1455. # Tune scalability tests
  1456. ########################
  1457. - name: tune_scalability_bookkeeping_overhead
  1458. group: Tune scalability tests
  1459. working_dir: tune_tests/scalability_tests
  1460. frequency: nightly
  1461. team: ml
  1462. cluster:
  1463. cluster_env: app_config.yaml
  1464. cluster_compute: tpl_1x16.yaml
  1465. run:
  1466. timeout: 1200
  1467. script: python workloads/test_bookkeeping_overhead.py
  1468. alert: tune_tests
  1469. variations:
  1470. - __suffix__: aws
  1471. - __suffix__: gce
  1472. env: gce
  1473. frequency: manual
  1474. cluster:
  1475. cluster_env: app_config.yaml
  1476. cluster_compute: tpl_gce_1x16.yaml
  1477. - name: tune_scalability_durable_trainable
  1478. group: Tune scalability tests
  1479. working_dir: tune_tests/scalability_tests
  1480. frequency: nightly
  1481. team: ml
  1482. cluster:
  1483. cluster_env: app_config.yaml
  1484. cluster_compute: tpl_16x2.yaml
  1485. run:
  1486. timeout: 900
  1487. script: python workloads/test_durable_trainable.py --bucket s3://tune-cloud-tests/scalability_durable_trainable
  1488. wait_for_nodes:
  1489. num_nodes: 16
  1490. variations:
  1491. - __suffix__: aws
  1492. - __suffix__: gce
  1493. env: gce
  1494. frequency: manual
  1495. run:
  1496. timeout: 900
  1497. script: python workloads/test_durable_trainable.py --bucket gs://tune-cloud-tests/scalability_durable_trainable
  1498. wait_for_nodes:
  1499. num_nodes: 16
  1500. cluster:
  1501. cluster_env: app_config.yaml
  1502. cluster_compute: tpl_gce_16x2.yaml
  1503. alert: tune_tests
  1504. - name: tune_scalability_durable_multifile_checkpoints
  1505. group: Tune scalability tests
  1506. working_dir: tune_tests/scalability_tests
  1507. frequency: nightly
  1508. team: ml
  1509. cluster:
  1510. cluster_env: app_config.yaml
  1511. cluster_compute: tpl_16x2.yaml
  1512. run:
  1513. timeout: 900
  1514. script: python workloads/test_durable_multifile_checkpoints.py --bucket s3://tune-cloud-tests/scalability_durable_multifile_checkpoints
  1515. wait_for_nodes:
  1516. num_nodes: 16
  1517. variations:
  1518. - __suffix__: aws
  1519. - __suffix__: gce
  1520. env: gce
  1521. frequency: manual
  1522. run:
  1523. timeout: 900
  1524. script: python workloads/test_durable_multifile_checkpoints.py --bucket gs://tune-cloud-tests/scalability_durable_multifile_checkpoints
  1525. wait_for_nodes:
  1526. num_nodes: 16
  1527. cluster:
  1528. cluster_env: app_config.yaml
  1529. cluster_compute: tpl_gce_16x2.yaml
  1530. alert: tune_tests
  1531. - name: tune_scalability_long_running_large_checkpoints
  1532. group: Tune scalability tests
  1533. working_dir: tune_tests/scalability_tests
  1534. frequency: weekly
  1535. team: ml
  1536. cluster:
  1537. cluster_env: app_config.yaml
  1538. cluster_compute: tpl_1x32_hd.yaml
  1539. run:
  1540. timeout: 86400
  1541. script: python workloads/test_long_running_large_checkpoints.py
  1542. long_running: true
  1543. smoke_test:
  1544. frequency: nightly
  1545. run:
  1546. timeout: 3600
  1547. alert: tune_tests
  1548. variations:
  1549. - __suffix__: aws
  1550. - __suffix__: gce
  1551. env: gce
  1552. frequency: manual
  1553. smoke_test:
  1554. frequency: manual
  1555. cluster:
  1556. cluster_env: app_config.yaml
  1557. cluster_compute: tpl_gce_1x32_hd.yaml
  1558. - name: tune_scalability_network_overhead
  1559. group: Tune scalability tests
  1560. working_dir: tune_tests/scalability_tests
  1561. frequency: weekly
  1562. team: ml
  1563. cluster:
  1564. cluster_env: app_config.yaml
  1565. cluster_compute: tpl_100x2.yaml
  1566. run:
  1567. timeout: 900
  1568. prepare_timeout: 1200
  1569. script: python workloads/test_network_overhead.py
  1570. wait_for_nodes:
  1571. num_nodes: 100
  1572. alert: tune_tests
  1573. variations:
  1574. - __suffix__: aws
  1575. - __suffix__: smoke-test
  1576. frequency: nightly
  1577. cluster:
  1578. cluster_env: app_config.yaml
  1579. cluster_compute: tpl_20x2.yaml
  1580. run:
  1581. timeout: 500
  1582. prepare_timeout: 600
  1583. script: python workloads/test_network_overhead.py --smoke-test
  1584. wait_for_nodes:
  1585. num_nodes: 20
  1586. - __suffix__: gce
  1587. env: gce
  1588. frequency: manual
  1589. cluster:
  1590. cluster_env: app_config.yaml
  1591. cluster_compute: tpl_gce_100x2.yaml
  1592. - name: tune_scalability_result_throughput_cluster
  1593. group: Tune scalability tests
  1594. working_dir: tune_tests/scalability_tests
  1595. frequency: nightly-3x
  1596. team: ml
  1597. cluster:
  1598. cluster_env: app_config.yaml
  1599. cluster_compute: tpl_16x64.yaml
  1600. run:
  1601. timeout: 600
  1602. script: python workloads/test_result_throughput_cluster.py
  1603. wait_for_nodes:
  1604. num_nodes: 16
  1605. alert: tune_tests
  1606. variations:
  1607. - __suffix__: aws
  1608. - __suffix__: gce
  1609. env: gce
  1610. frequency: manual
  1611. cluster:
  1612. cluster_env: app_config.yaml
  1613. cluster_compute: tpl_gce_16x64.yaml
  1614. - name: tune_scalability_result_throughput_single_node
  1615. group: Tune scalability tests
  1616. working_dir: tune_tests/scalability_tests
  1617. frequency: nightly
  1618. team: ml
  1619. cluster:
  1620. cluster_env: app_config.yaml
  1621. cluster_compute: tpl_1x96.yaml
  1622. run:
  1623. timeout: 600
  1624. script: python workloads/test_result_throughput_single_node.py
  1625. alert: tune_tests
  1626. variations:
  1627. - __suffix__: aws
  1628. - __suffix__: gce
  1629. env: gce
  1630. frequency: manual
  1631. cluster:
  1632. cluster_env: app_config.yaml
  1633. cluster_compute: tpl_gce_1x96.yaml
  1634. run:
  1635. timeout: 600
  1636. script: python workloads/test_result_throughput_single_node.py
  1637. - name: tune_scalability_xgboost_sweep
  1638. group: Tune scalability tests
  1639. working_dir: tune_tests/scalability_tests
  1640. frequency: weekly
  1641. team: ml
  1642. cluster:
  1643. cluster_env: app_config_data.yaml
  1644. cluster_compute: tpl_16x64.yaml
  1645. run:
  1646. timeout: 3600
  1647. script: python workloads/test_xgboost_sweep.py
  1648. wait_for_nodes:
  1649. num_nodes: 16
  1650. alert: tune_tests
  1651. variations:
  1652. - __suffix__: aws
  1653. - __suffix__: gce
  1654. env: gce
  1655. frequency: manual
  1656. cluster:
  1657. cluster_env: app_config_data.yaml
  1658. cluster_compute: tpl_gce_16x64.yaml
  1659. ############################
  1660. # Tune fault tolerance tests
  1661. ############################
  1662. - name: tune_worker_fault_tolerance
  1663. group: Tune fault tolerance tests
  1664. working_dir: tune_tests/fault_tolerance_tests
  1665. stable: true
  1666. frequency: nightly-3x
  1667. team: ml
  1668. cluster:
  1669. cluster_env: app_config.yaml
  1670. cluster_compute: tpl_aws_16x1.yaml
  1671. run:
  1672. timeout: 5400
  1673. script: python workloads/test_tune_worker_fault_tolerance.py --bucket s3://tune-cloud-tests/worker_fault_tolerance
  1674. wait_for_nodes:
  1675. num_nodes: 16
  1676. # Disabled until we can kill nodes in GCE
  1677. # variations:
  1678. # - __suffix__: aws
  1679. # - __suffix__: gce
  1680. # env: gce
  1681. # frequency: manual
  1682. # run:
  1683. # timeout: 5400
  1684. # script: python workloads/test_tune_worker_fault_tolerance.py --bucket gs://tune-cloud-tests/worker_fault_tolerance
  1685. #
  1686. # wait_for_nodes:
  1687. # num_nodes: 16
  1688. # cluster:
  1689. # cluster_env: app_config.yaml
  1690. # cluster_compute: tpl_gce_16x1.yaml
  1691. ########################
  1692. # Golden Notebook tests
  1693. ########################
  1694. - name: golden_notebook_torch_tune_serve_test
  1695. group: Golden Notebook tests
  1696. working_dir: golden_notebook_tests
  1697. frequency: nightly-3x
  1698. team: ml
  1699. cluster:
  1700. cluster_env: torch_tune_serve_app_config.yaml
  1701. cluster_compute: gpu_tpl_aws.yaml
  1702. run:
  1703. timeout: 600
  1704. script: python workloads/torch_tune_serve_test.py
  1705. wait_for_nodes:
  1706. num_nodes: 2
  1707. variations:
  1708. - __suffix__: aws
  1709. - __suffix__: gce
  1710. env: gce
  1711. frequency: manual
  1712. cluster:
  1713. cluster_env: torch_tune_serve_app_config.yaml
  1714. cluster_compute: gpu_tpl_gce.yaml
  1715. alert: default
  1716. #######################
  1717. # Long running tests
  1718. #######################
  1719. - name: long_running_actor_deaths
  1720. group: Long running tests
  1721. working_dir: long_running_tests
  1722. frequency: weekly
  1723. python: "3.8"
  1724. team: core
  1725. cluster:
  1726. byod:
  1727. runtime_env:
  1728. - RLLIB_TEST_NO_JAX_IMPORT=1
  1729. cluster_env: app_config.yaml
  1730. cluster_compute: tpl_cpu_1.yaml
  1731. run:
  1732. timeout: 86400
  1733. script: python workloads/actor_deaths.py
  1734. long_running: true
  1735. smoke_test:
  1736. frequency: nightly
  1737. run:
  1738. timeout: 3600
  1739. alert: long_running_tests
  1740. variations:
  1741. - __suffix__: aws
  1742. - __suffix__: gce
  1743. env: gce
  1744. frequency: manual
  1745. smoke_test:
  1746. frequency: manual
  1747. cluster:
  1748. cluster_env: app_config.yaml
  1749. cluster_compute: tpl_cpu_1_gce.yaml
  1750. - name: long_running_apex
  1751. group: Long running tests
  1752. working_dir: long_running_tests
  1753. frequency: weekly
  1754. team: rllib
  1755. cluster:
  1756. cluster_env: ../rllib_tests/app_config.yaml
  1757. cluster_compute: tpl_cpu_3.yaml
  1758. run:
  1759. timeout: 86400
  1760. script: python workloads/apex.py
  1761. long_running: true
  1762. wait_for_nodes:
  1763. num_nodes: 3
  1764. smoke_test:
  1765. frequency: nightly
  1766. run:
  1767. timeout: 3600
  1768. alert: long_running_tests
  1769. variations:
  1770. - __suffix__: aws
  1771. - __suffix__: gce
  1772. env: gce
  1773. frequency: manual
  1774. smoke_test:
  1775. frequency: manual
  1776. run:
  1777. timeout: 3600
  1778. cluster:
  1779. cluster_env: ../rllib_tests/app_config.yaml
  1780. cluster_compute: tpl_cpu_3_gce.yaml
  1781. - name: long_running_impala
  1782. group: Long running tests
  1783. working_dir: long_running_tests
  1784. frequency: weekly
  1785. team: rllib
  1786. cluster:
  1787. cluster_env: ../rllib_tests/app_config.yaml
  1788. cluster_compute: tpl_cpu_1_large.yaml
  1789. run:
  1790. timeout: 86400
  1791. script: python workloads/impala.py
  1792. long_running: true
  1793. smoke_test:
  1794. frequency: nightly
  1795. run:
  1796. timeout: 3600
  1797. alert: long_running_tests
  1798. variations:
  1799. - __suffix__: aws
  1800. - __suffix__: gce
  1801. env: gce
  1802. frequency: manual
  1803. smoke_test:
  1804. frequency: manual
  1805. run:
  1806. timeout: 3600
  1807. cluster:
  1808. cluster_env: ../rllib_tests/app_config.yaml
  1809. cluster_compute: tpl_cpu_1_large_gce.yaml
  1810. - name: long_running_many_actor_tasks
  1811. group: Long running tests
  1812. working_dir: long_running_tests
  1813. frequency: weekly
  1814. python: "3.8"
  1815. team: core
  1816. cluster:
  1817. byod:
  1818. runtime_env:
  1819. - RLLIB_TEST_NO_JAX_IMPORT=1
  1820. cluster_env: app_config.yaml
  1821. cluster_compute: tpl_cpu_1.yaml
  1822. run:
  1823. timeout: 86400
  1824. script: python workloads/many_actor_tasks.py
  1825. long_running: true
  1826. smoke_test:
  1827. frequency: nightly
  1828. run:
  1829. timeout: 3600
  1830. alert: long_running_tests
  1831. variations:
  1832. - __suffix__: aws
  1833. - __suffix__: gce
  1834. env: gce
  1835. frequency: manual
  1836. smoke_test:
  1837. frequency: manual
  1838. run:
  1839. timeout: 3600
  1840. cluster:
  1841. cluster_env: app_config.yaml
  1842. cluster_compute: tpl_cpu_1_gce.yaml
  1843. - name: long_running_many_drivers
  1844. group: Long running tests
  1845. working_dir: long_running_tests
  1846. frequency: weekly
  1847. python: "3.8"
  1848. team: core
  1849. cluster:
  1850. byod:
  1851. runtime_env:
  1852. - RLLIB_TEST_NO_JAX_IMPORT=1
  1853. cluster_env: app_config.yaml
  1854. cluster_compute: tpl_cpu_1.yaml
  1855. run:
  1856. timeout: 86400
  1857. script: python workloads/many_drivers.py --iteration-num=4000
  1858. long_running: true
  1859. smoke_test:
  1860. frequency: nightly
  1861. run:
  1862. timeout: 3600
  1863. alert: long_running_tests
  1864. variations:
  1865. - __suffix__: aws
  1866. - __suffix__: gce
  1867. env: gce
  1868. frequency: manual
  1869. smoke_test:
  1870. frequency: manual
  1871. run:
  1872. timeout: 3600
  1873. cluster:
  1874. cluster_env: app_config.yaml
  1875. cluster_compute: tpl_cpu_1_gce.yaml
  1876. - name: long_running_many_ppo
  1877. group: Long running tests
  1878. working_dir: long_running_tests
  1879. stable: false
  1880. frequency: weekly
  1881. team: ml
  1882. cluster:
  1883. cluster_env: ../rllib_tests/app_config.yaml
  1884. cluster_compute: many_ppo.yaml
  1885. run:
  1886. timeout: 86400
  1887. script: python workloads/many_ppo.py
  1888. long_running: true
  1889. wait_for_nodes:
  1890. num_nodes: 1
  1891. smoke_test:
  1892. frequency: nightly
  1893. run:
  1894. timeout: 3600
  1895. alert: long_running_tests
  1896. variations:
  1897. - __suffix__: aws
  1898. - __suffix__: gce
  1899. env: gce
  1900. frequency: manual
  1901. smoke_test:
  1902. frequency: manual
  1903. run:
  1904. timeout: 3600
  1905. cluster:
  1906. cluster_env: ../rllib_tests/app_config.yaml
  1907. cluster_compute: many_ppo_gce.yaml
  1908. - name: long_running_many_tasks
  1909. group: Long running tests
  1910. working_dir: long_running_tests
  1911. frequency: weekly
  1912. python: "3.8"
  1913. team: core
  1914. cluster:
  1915. byod:
  1916. runtime_env:
  1917. - RLLIB_TEST_NO_JAX_IMPORT=1
  1918. cluster_env: app_config.yaml
  1919. cluster_compute: tpl_cpu_1.yaml
  1920. run:
  1921. timeout: 86400
  1922. script: python workloads/many_tasks.py
  1923. long_running: true
  1924. smoke_test:
  1925. frequency: nightly
  1926. run:
  1927. timeout: 3600
  1928. alert: long_running_tests
  1929. variations:
  1930. - __suffix__: aws
  1931. - __suffix__: gce
  1932. env: gce
  1933. frequency: manual
  1934. smoke_test:
  1935. frequency: manual
  1936. run:
  1937. timeout: 3600
  1938. cluster:
  1939. cluster_env: app_config.yaml
  1940. cluster_compute: tpl_cpu_1_gce.yaml
  1941. - name: long_running_many_tasks_serialized_ids
  1942. group: Long running tests
  1943. working_dir: long_running_tests
  1944. frequency: weekly
  1945. python: "3.8"
  1946. team: core
  1947. cluster:
  1948. byod:
  1949. runtime_env:
  1950. - RLLIB_TEST_NO_JAX_IMPORT=1
  1951. cluster_env: app_config.yaml
  1952. cluster_compute: tpl_cpu_1.yaml
  1953. run:
  1954. timeout: 86400
  1955. script: python workloads/many_tasks_serialized_ids.py
  1956. long_running: true
  1957. smoke_test:
  1958. frequency: nightly
  1959. run:
  1960. timeout: 3600
  1961. alert: long_running_tests
  1962. variations:
  1963. - __suffix__: aws
  1964. - __suffix__: gce
  1965. env: gce
  1966. frequency: manual
  1967. smoke_test:
  1968. frequency: manual
  1969. run:
  1970. timeout: 3600
  1971. cluster:
  1972. cluster_env: app_config.yaml
  1973. cluster_compute: tpl_cpu_1_gce.yaml
  1974. - name: long_running_node_failures
  1975. group: Long running tests
  1976. working_dir: long_running_tests
  1977. frequency: weekly
  1978. python: "3.8"
  1979. team: core
  1980. cluster:
  1981. byod:
  1982. runtime_env:
  1983. - RLLIB_TEST_NO_JAX_IMPORT=1
  1984. cluster_env: app_config.yaml
  1985. cluster_compute: tpl_cpu_1.yaml
  1986. run:
  1987. timeout: 86400
  1988. script: python workloads/node_failures.py
  1989. long_running: true
  1990. smoke_test:
  1991. frequency: nightly
  1992. run:
  1993. timeout: 3600
  1994. alert: long_running_tests
  1995. variations:
  1996. - __suffix__: aws
  1997. - __suffix__: gce
  1998. env: gce
  1999. frequency: manual
  2000. smoke_test:
  2001. frequency: manual
  2002. run:
  2003. timeout: 3600
  2004. cluster:
  2005. cluster_env: app_config.yaml
  2006. cluster_compute: tpl_cpu_1_gce.yaml
  2007. - name: long_running_pbt
  2008. group: Long running tests
  2009. working_dir: long_running_tests
  2010. frequency: weekly
  2011. team: ml
  2012. cluster:
  2013. cluster_env: ../rllib_tests/app_config.yaml
  2014. cluster_compute: tpl_cpu_1.yaml
  2015. run:
  2016. timeout: 86400
  2017. script: python workloads/pbt.py
  2018. long_running: true
  2019. smoke_test:
  2020. frequency: nightly
  2021. run:
  2022. timeout: 3600
  2023. alert: long_running_tests
  2024. variations:
  2025. - __suffix__: aws
  2026. - __suffix__: gce
  2027. env: gce
  2028. frequency: manual
  2029. smoke_test:
  2030. frequency: manual
  2031. run:
  2032. timeout: 3600
  2033. cluster:
  2034. cluster_env: ../rllib_tests/app_config.yaml
  2035. cluster_compute: tpl_cpu_1_gce.yaml
  2036. - name: long_running_serve
  2037. group: Long running tests
  2038. working_dir: long_running_tests
  2039. frequency: weekly
  2040. team: serve
  2041. cluster:
  2042. cluster_env: app_config.yaml
  2043. cluster_compute: tpl_cpu_1.yaml
  2044. run:
  2045. timeout: 86400
  2046. script: python workloads/serve.py
  2047. long_running: true
  2048. smoke_test:
  2049. frequency: nightly
  2050. run:
  2051. timeout: 3600
  2052. alert: long_running_tests
  2053. variations:
  2054. - __suffix__: aws
  2055. - __suffix__: gce
  2056. env: gce
  2057. frequency: manual
  2058. smoke_test:
  2059. frequency: manual
  2060. run:
  2061. timeout: 3600
  2062. cluster:
  2063. cluster_env: app_config.yaml
  2064. cluster_compute: tpl_cpu_1_gce.yaml
  # Weekly long-running run of workloads/serve_failure.py (timeout 86400),
  # marked stable. The nightly smoke test is capped at 600.
  - name: long_running_serve_failure
    group: Long running tests
    working_dir: long_running_tests
    stable: true

    frequency: weekly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1_c5.yaml

    run:
      timeout: 86400
      script: python workloads/serve_failure.py
      long_running: true

    smoke_test:
      frequency: nightly
      run:
        timeout: 600

    alert: long_running_tests

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            # NOTE(review): this smoke-test timeout equals the full run
            # timeout (86400), while the default smoke test above uses 600.
            # Confirm this is intentional.
            timeout: 86400
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_c5_gce.yaml
  # Weekly long-running run of workloads/long_running_many_jobs.py with one
  # client (timeout 86400), marked stable. Nightly smoke test capped at 1800.
  - name: long_running_many_jobs
    group: Long running tests
    working_dir: long_running_tests
    stable: true

    frequency: weekly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml

    run:
      timeout: 86400
      script: python workloads/long_running_many_jobs.py --num-clients=1
      long_running: true

    smoke_test:
      frequency: nightly
      run:
        timeout: 1800

    alert: long_running_tests

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; its smoke test uses a 3600 timeout.
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Weekly long-running run of workloads/pytorch_pbt_failure.py (timeout
  # 86400). The smoke test is manual-only for every variation.
  - name: long_running_distributed_pytorch_pbt_failure
    group: Long running tests
    working_dir: long_running_distributed_tests

    frequency: weekly
    team: ml

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl.yaml

    run:
      timeout: 86400
      script: python workloads/pytorch_pbt_failure.py
      long_running: true

    smoke_test:
      frequency: manual
      run:
        timeout: 3600

    alert: long_running_tests

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce.yaml
  2154. ########################
  2155. # Jobs tests
  2156. ########################
  # Nightly jobs test: runs workloads/jobs_basic.py against the local
  # "workloads" working dir once 4 nodes are up.
  - name: jobs_basic_local_working_dir
    group: Jobs tests
    working_dir: jobs_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_4_xlarge.yaml

    run:
      timeout: 600
      script: python workloads/jobs_basic.py --working-dir "workloads"
      wait_for_nodes:
        num_nodes: 4

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_4_xlarge.yaml
  # Nightly jobs test: same script as the local-working-dir test, but the
  # working dir is a remote zip archive URL.
  - name: jobs_basic_remote_working_dir
    group: Jobs tests
    working_dir: jobs_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_4_xlarge.yaml

    run:
      timeout: 600
      script: python workloads/jobs_basic.py --working-dir "https://github.com/anyscale/job-services-cuj-examples/archive/refs/heads/main.zip"
      wait_for_nodes:
        num_nodes: 4

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_4_xlarge.yaml
  # Nightly jobs test: runs workloads/jobs_remote_multi_node.py once 4 nodes
  # are up. No alert handler is configured for this entry.
  - name: jobs_remote_multi_node
    group: Jobs tests

    team: serve
    frequency: nightly

    working_dir: jobs_tests

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_4_xlarge.yaml

    run:
      timeout: 600
      script: python workloads/jobs_remote_multi_node.py
      wait_for_nodes:
        num_nodes: 4

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_4_xlarge.yaml
  # Nightly jobs test: runs workloads/jobs_check_cuda_available.py on the
  # GPU-node compute config once 2 nodes are up.
  - name: jobs_check_cuda_available
    group: Jobs tests

    team: serve
    frequency: nightly

    working_dir: jobs_tests

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_gpu_node.yaml

    run:
      timeout: 600
      script: python workloads/jobs_check_cuda_available.py
      wait_for_nodes:
        num_nodes: 2

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_gpu_node.yaml
  # Nightly jobs test: runs workloads/jobs_specify_num_gpus.py on the
  # GPU-worker compute config once 2 nodes are up.
  - name: jobs_specify_num_gpus
    group: Jobs tests

    team: serve
    frequency: nightly

    working_dir: jobs_tests

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_gpu_worker.yaml

    run:
      timeout: 600
      script: python workloads/jobs_specify_num_gpus.py --working-dir "workloads"
      wait_for_nodes:
        num_nodes: 2

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_gpu_worker.yaml
  2264. ########################
  2265. # Runtime env tests
  2266. ########################
  # Nightly runtime-env test: runs workloads/rte_many_tasks_actors.py once
  # 4 nodes are up.
  - name: runtime_env_rte_many_tasks_actors
    group: Runtime env tests
    working_dir: runtime_env_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: rte_small.yaml

    run:
      timeout: 600
      script: python workloads/rte_many_tasks_actors.py
      wait_for_nodes:
        num_nodes: 4

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: rte_gce_small.yaml
  # Nightly runtime-env test: runs workloads/wheel_urls.py on a single node
  # with a 9000 timeout.
  - name: runtime_env_wheel_urls
    group: Runtime env tests
    working_dir: runtime_env_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: rte_minimal.yaml

    run:
      timeout: 9000
      script: python workloads/wheel_urls.py
      wait_for_nodes:
        num_nodes: 1

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: rte_gce_minimal.yaml
  2311. # It seems like the consensus is that this should be tested in CI, and not in a nightly test.
  2312. # - name: runtime_env_rte_ray_client
  2313. # group: Runtime env tests
  2314. # working_dir: runtime_env_tests
  2315. # frequency: nightly
  2316. # team: serve
  2317. # cluster:
  2318. # cluster_env: app_config.yaml
  2319. # cluster_compute: rte_minimal.yaml
  2320. # run:
  2321. # timeout: 600
  2322. # script: python workloads/rte_ray_client.py
  2323. # wait_for_nodes:
  2324. # num_nodes: 1
  2325. # alert: default
  2326. ########################
  2327. # Serve tests
  2328. ########################
  # Nightly Serve test: runs workloads/single_deployment_1k_noop_replica.py
  # on the 32-CPU compute config; explicitly not a long-running test.
  - name: serve_single_deployment_1k_noop_replica
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_32_cpu.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/single_deployment_1k_noop_replica.py

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_32_cpu_gce.yaml
  # Nightly Serve test: runs workloads/multi_deployment_1k_noop_replica.py
  # on the 32-CPU compute config; explicitly not a long-running test.
  - name: serve_multi_deployment_1k_noop_replica
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_32_cpu.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/multi_deployment_1k_noop_replica.py

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_32_cpu_gce.yaml
  # Nightly Serve test: runs workloads/autoscaling_single_deployment.py on
  # the 8-CPU autoscaling compute config.
  - name: serve_autoscaling_single_deployment
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_8_cpu_autoscaling.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/autoscaling_single_deployment.py

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_8_cpu_autoscaling_gce.yaml
  # Nightly Serve test: runs workloads/autoscaling_multi_deployment.py on
  # the 32-CPU autoscaling compute config.
  - name: serve_autoscaling_multi_deployment
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_32_cpu_autoscaling.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/autoscaling_multi_deployment.py

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_32_cpu_autoscaling_gce.yaml
  # Nightly Serve test: runs workloads/serve_micro_benchmark.py on the
  # single-node compute config.
  - name: serve_serve_micro_benchmark
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_micro_benchmark.py

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_gce.yaml
  2434. # - name: serve_serve_micro_benchmark_k8s
  2435. # group: Serve tests
  2436. # working_dir: serve_tests
  2437. # # TODO(architkulkarni) Reenable after K8s migration. Currently failing
  2438. # frequency: manual
  2439. # team: serve
  2440. # cluster:
  2441. # cluster_env: app_config.yaml
  2442. # cluster_compute: compute_tpl_single_node_k8s.yaml
  2443. # run:
  2444. # timeout: 7200
  2445. # long_running: false
  2446. # script: python workloads/serve_micro_benchmark.py
  2447. # alert: default
  # Nightly Serve test: runs workloads/deployment_graph_long_chain.py
  # (chain length 10, 4 clients) on the single-node 32-CPU compute config.
  - name: deployment_graph_long_chain
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml

    run:
      timeout: 3600
      long_running: false
      script: python workloads/deployment_graph_long_chain.py --chain-length=10 --num-clients=4 --local-test=False

    alert: default
    # Canonical lowercase boolean, matching `stable: false` used by other
    # entries in this file.
    stable: false

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Nightly Serve test: runs workloads/deployment_graph_wide_ensemble.py
  # (fanout degree 10, 4 clients) on the single-node 32-CPU compute config.
  - name: deployment_graph_wide_ensemble
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml

    run:
      timeout: 3600
      long_running: false
      script: python workloads/deployment_graph_wide_ensemble.py --fanout-degree=10 --num-clients=4 --local-test=False

    alert: default
    # Canonical lowercase boolean, matching `stable: false` used by other
    # entries in this file.
    stable: false

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Nightly Serve test: runs workloads/serve_handle_long_chain.py
  # (chain length 10, 4 clients) on the single-node 32-CPU compute config.
  - name: serve_handle_long_chain
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml

    run:
      timeout: 3600
      long_running: false
      script: python workloads/serve_handle_long_chain.py --chain-length=10 --num-clients=4 --local-test=False

    alert: default
    # Canonical lowercase boolean, matching `stable: false` used by other
    # entries in this file.
    stable: false

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Nightly Serve test: runs workloads/serve_handle_wide_ensemble.py
  # (fanout degree 10, 4 clients) on the single-node 32-CPU compute config.
  - name: serve_handle_wide_ensemble
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml

    run:
      timeout: 3600
      long_running: false
      script: python workloads/serve_handle_wide_ensemble.py --fanout-degree=10 --num-clients=4 --local-test=False

    alert: default
    # Canonical lowercase boolean, matching `stable: false` used by other
    # entries in this file.
    stable: false

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Nightly Serve test: runs workloads/serve_protocol_benchmark.py with
  # --data-size=1048576 and no --http-test flag.
  - name: serve_micro_protocol_grpc_benchmark
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_protocol_benchmark.py --data-size=1048576

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_gce.yaml
  # Nightly Serve test: runs workloads/serve_protocol_benchmark.py with
  # --data-size=1048576 and the --http-test flag.
  - name: serve_micro_protocol_http_benchmark
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_protocol_benchmark.py --data-size=1048576 --http-test

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_gce.yaml
  # Nightly Serve test: runs workloads/serve_resnet_benchmark.py with
  # --gpu-env, using the GPU cluster env and the GPU-node compute config.
  - name: serve_resnet_benchmark
    group: Serve tests
    working_dir: serve_tests

    frequency: nightly
    team: serve

    cluster:
      cluster_env: gpu_app_config.yaml
      cluster_compute: compute_tpl_gpu_node.yaml

    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_resnet_benchmark.py --gpu-env

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: gpu_app_config.yaml
          cluster_compute: compute_tpl_gpu_node_gce.yaml
  2599. ########################
  2600. # Train tests
  2601. ########################
  # Nightly Train test: runs train_horovod_multi_node_test.py once 2 nodes
  # are up.
  - name: train_horovod_multi_node_test
    group: Train tests
    working_dir: train_tests/horovod

    frequency: nightly
    team: ml

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_aws.yaml

    run:
      timeout: 3000
      script: python train_horovod_multi_node_test.py
      wait_for_nodes:
        num_nodes: 2

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce.yaml

    # NOTE(review): placed at test level (not inside the gce variation),
    # matching the other entries in this file — confirm against the original
    # indentation.
    alert: default
  2624. ########################
  2625. # Alpa tests
  2626. ########################
  # Nightly Alpa test: runs run_train_opt_2_7b.sh with --storage aws once
  # 2 nodes are up.
  - name: alpa_opt_2_7b_sanity_check
    group: Alpa tests
    working_dir: alpa_tests

    frequency: nightly
    team: ml

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: gpu_2x4_t4_aws.yaml

    run:
      timeout: 3600
      script: bash run_train_opt_2_7b.sh --storage aws
      wait_for_nodes:
        num_nodes: 2

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; overrides both the compute config and the
      # run section so the script is invoked with --storage gcs.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: gpu_2x4_t4_gce.yaml
        run:
          timeout: 3600
          script: bash run_train_opt_2_7b.sh --storage gcs
          wait_for_nodes:
            num_nodes: 2

    # NOTE(review): placed at test level (not inside the gce variation) —
    # confirm against the original indentation.
    alert: default
  # Nightly Alpa test: runs run_inference_opt_30b.sh with --storage aws on a
  # single node.
  - name: alpa_opt_30b_inference
    group: Alpa tests
    working_dir: alpa_tests

    frequency: nightly
    team: ml

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: gpu_1x8_v100_aws.yaml

    run:
      timeout: 3600
      script: bash run_inference_opt_30b.sh --storage aws
      wait_for_nodes:
        num_nodes: 1

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; overrides both the compute config and the
      # run section so the script is invoked with --storage gcs.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: gpu_1x8_v100_gce.yaml
        run:
          timeout: 3600
          script: bash run_inference_opt_30b.sh --storage gcs
          wait_for_nodes:
            num_nodes: 1

    # NOTE(review): placed at test level (not inside the gce variation) —
    # confirm against the original indentation.
    alert: default
  2681. ########################
  2682. # RLlib tests
  2683. ########################
  # Nightly RLlib test: runs the learner-group checkpointing pytest module
  # once 3 nodes are up.
  - name: rllib_learner_group_checkpointing_multinode
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: multi_node_checkpointing_compute_config.yaml

    run:
      timeout: 3600
      script: pytest checkpointing_tests/test_learner_group_checkpointing.py
      wait_for_nodes:
        num_nodes: 3

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  # Nightly RLlib test: runs the end-to-end RL module restore pytest module
  # once 3 nodes are up.
  - name: rllib_learner_e2e_module_loading
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: multi_node_checkpointing_compute_config.yaml

    run:
      timeout: 3600
      script: pytest checkpointing_tests/test_e2e_rl_module_restore.py
      wait_for_nodes:
        num_nodes: 3

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  # Nightly RLlib test: runs the basic multi-node training learner smoke
  # test via pytest once 3 nodes are up.
  - name: rllib_multi_node_e2e_training_smoke_test
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: multi_node_checkpointing_compute_config.yaml

    run:
      timeout: 3600
      script: pytest smoke_tests/smoke_test_basic_multi_node_training_learner.py
      wait_for_nodes:
        num_nodes: 3

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  # Nightly RLlib learning test: "a2c" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_a2c_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a2c --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "a2c" YAML sub-dir, torch framework.
  - name: rllib_learning_tests_a2c_torch
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a2c --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "a3c" YAML sub-dir, tf framework, on the
  # 32-CPU compute config.
  - name: rllib_learning_tests_a3c_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 32cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a3c --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 32cpus_gce.yaml
  # Nightly RLlib learning test: "apex" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_apex_tf
    group: RLlib tests
    working_dir: rllib_tests

    # Marking as unstable since it's currently expected to fail.
    stable: false

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_24cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=apex --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_24cpus_gce.yaml
  # Nightly RLlib learning test: "apex" YAML sub-dir, torch framework.
  - name: rllib_learning_tests_apex_torch
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_24cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=apex --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_24cpus_gce.yaml
  # Nightly RLlib learning test: "appo/new_stack" YAML sub-dir. Note that the
  # script is invoked with --framework=tf2 even though the test name ends
  # in "_tf".
  - name: rllib_learning_tests_appo_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 4gpus_64cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=appo/new_stack --framework=tf2

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 4gpus_64cpus_gce.yaml
  # Nightly RLlib learning test: "appo/new_stack" YAML sub-dir, torch
  # framework.
  - name: rllib_learning_tests_appo_torch
    group: RLlib tests
    working_dir: rllib_tests

    # Marking as unstable since it's currently expected to fail.
    stable: false

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 4gpus_64cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=appo/new_stack --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 4gpus_64cpus_gce.yaml
  # TODO (sven): Remove this test once we are on new stack by default for APPO.
  # Nightly RLlib learning test: "appo/old_stack" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_appo_old_stack_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_32cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=appo/old_stack --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_32cpus_gce.yaml
  # TODO (sven): Remove this test once we are on new stack by default for APPO.
  # Nightly RLlib learning test: "appo/old_stack" YAML sub-dir, torch
  # framework.
  - name: rllib_learning_tests_appo_old_stack_torch
    group: RLlib tests
    working_dir: rllib_tests

    # Marking as unstable since it's currently expected to fail.
    stable: false

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_32cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=appo/old_stack --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_32cpus_gce.yaml
  # Nightly RLlib learning test: "bc" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_bc_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=bc --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "bc" YAML sub-dir, torch framework.
  - name: rllib_learning_tests_bc_torch
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=bc --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "cql" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_cql_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    # Marking as unstable since it's currently expected to fail.
    stable: false

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=cql --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "cql" YAML sub-dir, torch framework.
  - name: rllib_learning_tests_cql_torch
    group: RLlib tests
    working_dir: rllib_tests

    # Marking as unstable since it's currently expected to fail.
    stable: false

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=cql --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "ddpg" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_ddpg_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "ddpg" YAML sub-dir, torch framework.
  - name: rllib_learning_tests_ddpg_torch
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "dqn" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_dqn_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "dqn" YAML sub-dir, torch framework.
  - name: rllib_learning_tests_dqn_torch
    group: RLlib tests
    working_dir: rllib_tests

    # Marking as unstable since it's currently expected to fail.
    stable: false

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=torch

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Nightly RLlib learning test: "es" YAML sub-dir, tf framework.
  - name: rllib_learning_tests_es_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_64cpus.yaml

    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=es --framework=tf

    alert: default

    variations:
      - __suffix__: aws
      # GCE variant: manual-only; swaps in the GCE compute config.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_64cpus_gce.yaml
  3124. - name: rllib_learning_tests_es_torch
  3125. group: RLlib tests
  3126. working_dir: rllib_tests
  3127. frequency: nightly
  3128. team: rllib
  3129. cluster:
  3130. cluster_env: app_config.yaml
  3131. cluster_compute: 2gpus_64cpus.yaml
  3132. run:
  3133. timeout: 18000
  3134. script: python learning_tests/run.py --yaml-sub-dir=es --framework=torch
  3135. alert: default
  3136. variations:
  3137. - __suffix__: aws
  3138. - __suffix__: gce
  3139. env: gce
  3140. frequency: manual
  3141. cluster:
  3142. cluster_env: app_config.yaml
  3143. cluster_compute: 2gpus_64cpus_gce.yaml
  3144. - name: rllib_learning_tests_impala_tf
  3145. group: RLlib tests
  3146. working_dir: rllib_tests
  3147. frequency: nightly
  3148. team: rllib
  3149. cluster:
  3150. cluster_env: app_config.yaml
  3151. cluster_compute: 1gpu_16cpus.yaml
  3152. run:
  3153. timeout: 18000
  3154. script: python learning_tests/run.py --yaml-sub-dir=impala --framework=tf
  3155. alert: default
  3156. variations:
  3157. - __suffix__: aws
  3158. - __suffix__: gce
  3159. env: gce
  3160. frequency: manual
  3161. cluster:
  3162. cluster_env: app_config.yaml
  3163. cluster_compute: 1gpu_16cpus_gce.yaml
  3164. - name: rllib_learning_tests_impala_torch
  3165. group: RLlib tests
  3166. working_dir: rllib_tests
  3167. frequency: nightly
  3168. team: rllib
  3169. cluster:
  3170. cluster_env: app_config.yaml
  3171. cluster_compute: 1gpu_16cpus.yaml
  3172. run:
  3173. timeout: 18000
  3174. script: python learning_tests/run.py --yaml-sub-dir=impala --framework=torch
  3175. alert: default
  3176. variations:
  3177. - __suffix__: aws
  3178. - __suffix__: gce
  3179. env: gce
  3180. frequency: manual
  3181. cluster:
  3182. cluster_env: app_config.yaml
  3183. cluster_compute: 1gpu_16cpus_gce.yaml
  3184. - name: rllib_learning_tests_marwil_tf
  3185. group: RLlib tests
  3186. working_dir: rllib_tests
  3187. # Marking as unstable since it's currently expected to fail.
  3188. stable: false
  3189. frequency: nightly
  3190. team: rllib
  3191. cluster:
  3192. cluster_env: app_config.yaml
  3193. cluster_compute: 1gpu_16cpus.yaml
  3194. run:
  3195. timeout: 18000
  3196. script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=tf
  3197. alert: default
  3198. variations:
  3199. - __suffix__: aws
  3200. - __suffix__: gce
  3201. env: gce
  3202. frequency: manual
  3203. cluster:
  3204. cluster_env: app_config.yaml
  3205. cluster_compute: 1gpu_16cpus_gce.yaml
  3206. - name: rllib_learning_tests_marwil_torch
  3207. group: RLlib tests
  3208. working_dir: rllib_tests
  3209. # Marking as unstable since it's currently expected to fail.
  3210. stable: false
  3211. frequency: nightly
  3212. team: rllib
  3213. cluster:
  3214. cluster_env: app_config.yaml
  3215. cluster_compute: 1gpu_16cpus.yaml
  3216. run:
  3217. timeout: 18000
  3218. script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=torch
  3219. alert: default
  3220. variations:
  3221. - __suffix__: aws
  3222. - __suffix__: gce
  3223. env: gce
  3224. frequency: manual
  3225. cluster:
  3226. cluster_env: app_config.yaml
  3227. cluster_compute: 1gpu_16cpus_gce.yaml
  3228. - name: rllib_learning_tests_ppo_tf
  3229. group: RLlib tests
  3230. working_dir: rllib_tests
  3231. frequency: nightly
  3232. team: rllib
  3233. cluster:
  3234. cluster_env: app_config.yaml
  3235. cluster_compute: 4gpus_64cpus.yaml
  3236. run:
  3237. timeout: 18000
  3238. script: python learning_tests/run.py --yaml-sub-dir=ppo/new_stack --framework=tf2
  3239. alert: default
  3240. variations:
  3241. - __suffix__: aws
  3242. - __suffix__: gce
  3243. env: gce
  3244. frequency: manual
  3245. cluster:
  3246. cluster_env: app_config.yaml
  3247. cluster_compute: 4gpus_64cpus_gce.yaml
  3248. - name: rllib_learning_tests_ppo_torch
  3249. group: RLlib tests
  3250. working_dir: rllib_tests
  3251. # Marking as unstable since it's currently expected to fail.
  3252. stable: false
  3253. frequency: nightly
  3254. team: rllib
  3255. cluster:
  3256. cluster_env: app_config.yaml
  3257. cluster_compute: 4gpus_64cpus.yaml
  3258. run:
  3259. timeout: 18000
  3260. script: python learning_tests/run.py --yaml-sub-dir=ppo/new_stack --framework=torch
  3261. alert: default
  3262. variations:
  3263. - __suffix__: aws
  3264. - __suffix__: gce
  3265. env: gce
  3266. frequency: manual
  3267. cluster:
  3268. cluster_env: app_config.yaml
  3269. cluster_compute: 4gpus_64cpus_gce.yaml
# TODO (sven): Remove this test once we are on new stack by default for PPO.
  3271. - name: rllib_learning_tests_ppo_old_stack_tf
  3272. group: RLlib tests
  3273. working_dir: rllib_tests
  3274. frequency: nightly
  3275. team: rllib
  3276. cluster:
  3277. cluster_env: app_config.yaml
  3278. cluster_compute: 2gpus_32cpus.yaml
  3279. run:
  3280. timeout: 18000
  3281. script: python learning_tests/run.py --yaml-sub-dir=ppo/old_stack --framework=tf
  3282. alert: default
  3283. variations:
  3284. - __suffix__: aws
  3285. - __suffix__: gce
  3286. env: gce
  3287. frequency: manual
  3288. cluster:
  3289. cluster_env: app_config.yaml
  3290. cluster_compute: 2gpus_32cpus_gce.yaml
# TODO (sven): Remove this test once we are on new stack by default for PPO.
  3292. - name: rllib_learning_tests_ppo_old_stack_torch
  3293. group: RLlib tests
  3294. working_dir: rllib_tests
  3295. # Marking as unstable since it's currently expected to fail.
  3296. stable: false
  3297. frequency: nightly
  3298. team: rllib
  3299. cluster:
  3300. cluster_env: app_config.yaml
  3301. cluster_compute: 2gpus_32cpus.yaml
  3302. run:
  3303. timeout: 18000
  3304. script: python learning_tests/run.py --yaml-sub-dir=ppo/old_stack --framework=torch
  3305. alert: default
  3306. variations:
  3307. - __suffix__: aws
  3308. - __suffix__: gce
  3309. env: gce
  3310. frequency: manual
  3311. cluster:
  3312. cluster_env: app_config.yaml
  3313. cluster_compute: 2gpus_32cpus_gce.yaml
  3314. - name: rllib_learning_tests_sac_tf
  3315. group: RLlib tests
  3316. working_dir: rllib_tests
  3317. frequency: nightly
  3318. team: rllib
  3319. cluster:
  3320. cluster_env: app_config.yaml
  3321. cluster_compute: 1gpu_16cpus.yaml
  3322. run:
  3323. timeout: 18000
  3324. script: python learning_tests/run.py --yaml-sub-dir=sac --framework=tf
  3325. alert: default
  3326. variations:
  3327. - __suffix__: aws
  3328. - __suffix__: gce
  3329. env: gce
  3330. frequency: manual
  3331. cluster:
  3332. cluster_env: app_config.yaml
  3333. cluster_compute: 1gpu_16cpus_gce.yaml
  3334. - name: rllib_learning_tests_sac_torch
  3335. group: RLlib tests
  3336. working_dir: rllib_tests
  3337. frequency: nightly
  3338. team: rllib
  3339. cluster:
  3340. cluster_env: app_config.yaml
  3341. cluster_compute: 1gpu_16cpus.yaml
  3342. run:
  3343. timeout: 18000
  3344. script: python learning_tests/run.py --yaml-sub-dir=sac --framework=torch
  3345. alert: default
  3346. variations:
  3347. - __suffix__: aws
  3348. - __suffix__: gce
  3349. env: gce
  3350. frequency: manual
  3351. cluster:
  3352. cluster_env: app_config.yaml
  3353. cluster_compute: 1gpu_16cpus_gce.yaml
  3354. - name: rllib_learning_tests_slateq_tf
  3355. group: RLlib tests
  3356. working_dir: rllib_tests
  3357. frequency: nightly
  3358. team: rllib
  3359. cluster:
  3360. cluster_env: app_config.yaml
  3361. cluster_compute: 1gpu_16cpus.yaml
  3362. run:
  3363. timeout: 18000
  3364. script: python learning_tests/run.py --yaml-sub-dir=slateq --framework=tf
  3365. alert: default
  3366. variations:
  3367. - __suffix__: aws
  3368. - __suffix__: gce
  3369. env: gce
  3370. frequency: manual
  3371. cluster:
  3372. cluster_env: app_config.yaml
  3373. cluster_compute: 1gpu_16cpus_gce.yaml
  3374. - name: rllib_learning_tests_slateq_torch
  3375. group: RLlib tests
  3376. working_dir: rllib_tests
  3377. # Marking as unstable since it's currently expected to fail.
  3378. stable: false
  3379. frequency: nightly
  3380. team: rllib
  3381. cluster:
  3382. cluster_env: app_config.yaml
  3383. cluster_compute: 1gpu_16cpus.yaml
  3384. run:
  3385. timeout: 18000
  3386. script: python learning_tests/run.py --yaml-sub-dir=slateq --framework=torch
  3387. alert: default
  3388. variations:
  3389. - __suffix__: aws
  3390. - __suffix__: gce
  3391. env: gce
  3392. frequency: manual
  3393. cluster:
  3394. cluster_env: app_config.yaml
  3395. cluster_compute: 1gpu_16cpus_gce.yaml
  3396. - name: rllib_learning_tests_td3_tf
  3397. group: RLlib tests
  3398. working_dir: rllib_tests
  3399. frequency: nightly
  3400. team: rllib
  3401. cluster:
  3402. cluster_env: app_config.yaml
  3403. cluster_compute: 1gpu_16cpus.yaml
  3404. run:
  3405. timeout: 18000
  3406. script: python learning_tests/run.py --yaml-sub-dir=td3 --framework=tf
  3407. alert: default
  3408. variations:
  3409. - __suffix__: aws
  3410. - __suffix__: gce
  3411. env: gce
  3412. frequency: manual
  3413. cluster:
  3414. cluster_env: app_config.yaml
  3415. cluster_compute: 1gpu_16cpus_gce.yaml
  3416. - name: rllib_learning_tests_td3_torch
  3417. group: RLlib tests
  3418. working_dir: rllib_tests
  3419. frequency: nightly
  3420. team: rllib
  3421. cluster:
  3422. cluster_env: app_config.yaml
  3423. cluster_compute: 1gpu_16cpus.yaml
  3424. run:
  3425. timeout: 18000
  3426. script: python learning_tests/run.py --yaml-sub-dir=td3 --framework=torch
  3427. alert: default
  3428. variations:
  3429. - __suffix__: aws
  3430. - __suffix__: gce
  3431. env: gce
  3432. frequency: manual
  3433. cluster:
  3434. cluster_env: app_config.yaml
  3435. cluster_compute: 1gpu_16cpus_gce.yaml
  3436. - name: rllib_multi_gpu_learning_tests
  3437. group: RLlib tests
  3438. working_dir: rllib_tests
  3439. frequency: nightly
  3440. team: rllib
  3441. cluster:
  3442. cluster_env: app_config.yaml
  3443. cluster_compute: 8gpus_96cpus.yaml
  3444. run:
  3445. timeout: 7200
  3446. script: python multi_gpu_learning_tests/run.py
  3447. alert: default
  3448. variations:
  3449. - __suffix__: aws
  3450. - __suffix__: gce
  3451. env: gce
  3452. frequency: manual
  3453. cluster:
  3454. cluster_env: app_config.yaml
  3455. cluster_compute: 8gpus_96cpus_gce.yaml
  3456. - name: rllib_multi_gpu_with_lstm_learning_tests
  3457. group: RLlib tests
  3458. working_dir: rllib_tests
  3459. frequency: nightly
  3460. team: rllib
  3461. cluster:
  3462. cluster_env: app_config.yaml
  3463. cluster_compute: 8gpus_96cpus.yaml
  3464. run:
  3465. timeout: 7200
  3466. script: python multi_gpu_with_lstm_learning_tests/run.py
  3467. alert: default
  3468. variations:
  3469. - __suffix__: aws
  3470. - __suffix__: gce
  3471. env: gce
  3472. frequency: manual
  3473. cluster:
  3474. cluster_env: app_config.yaml
  3475. cluster_compute: 8gpus_96cpus_gce.yaml
  3476. - name: rllib_multi_gpu_with_attention_learning_tests
  3477. group: RLlib tests
  3478. working_dir: rllib_tests
  3479. frequency: nightly
  3480. team: rllib
  3481. cluster:
  3482. cluster_env: app_config.yaml
  3483. cluster_compute: 8gpus_96cpus.yaml
  3484. run:
  3485. timeout: 7200
  3486. script: python multi_gpu_with_attention_learning_tests/run.py
  3487. alert: default
  3488. variations:
  3489. - __suffix__: aws
  3490. - __suffix__: gce
  3491. env: gce
  3492. frequency: manual
  3493. cluster:
  3494. # TODO(https://github.com/ray-project/ray/issues/34591)
  3495. # Revert to the comment below once ^ closed.
  3496. # cluster_env: app_config.yaml
  3497. cluster_env: debug_app_config.yaml
  3498. cluster_compute: 8gpus_96cpus_gce.yaml
  3499. - name: rllib_stress_tests
  3500. group: RLlib tests
  3501. working_dir: rllib_tests
  3502. frequency: weekly
  3503. team: rllib
  3504. cluster:
  3505. cluster_env: app_config.yaml
  3506. cluster_compute: 4gpus_544_cpus.yaml
  3507. run:
  3508. timeout: 5400
  3509. script: python stress_tests/run_stress_tests.py
  3510. wait_for_nodes:
  3511. num_nodes: 6
  3512. smoke_test:
  3513. frequency: nightly
  3514. run:
  3515. timeout: 2000
  3516. alert: default
  3517. variations:
  3518. - __suffix__: aws
  3519. - __suffix__: gce
  3520. env: gce
  3521. frequency: manual
  3522. smoke_test:
  3523. frequency: manual
  3524. run:
  3525. timeout: 2000
  3526. cluster:
  3527. cluster_env: app_config.yaml
  3528. cluster_compute: 4gpus_512_cpus_gce.yaml
  3529. ########################
  3530. # Core Nightly Tests
  3531. ########################
  3532. - name: shuffle_100gb
  3533. group: core-multi-test
  3534. working_dir: nightly_tests
  3535. frequency: nightly
  3536. python: "3.8"
  3537. team: core
  3538. cluster:
  3539. byod:
  3540. runtime_env:
  3541. - RAY_worker_killing_policy=retriable_lifo
  3542. cluster_env: shuffle/shuffle_app_config.yaml
  3543. cluster_compute: shuffle/shuffle_compute_multi.yaml
  3544. run:
  3545. timeout: 3000
  3546. script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
  3547. wait_for_nodes:
  3548. num_nodes: 4
  3549. variations:
  3550. - __suffix__: aws
  3551. - __suffix__: gce
  3552. env: gce
  3553. frequency: manual
  3554. cluster:
  3555. cluster_env: shuffle/shuffle_app_config.yaml
  3556. cluster_compute: shuffle/shuffle_compute_multi_gce.yaml
  3557. - name: stress_test_placement_group
  3558. group: core-multi-test
  3559. working_dir: nightly_tests
  3560. frequency: nightly
  3561. python: "3.8"
  3562. team: core
  3563. cluster:
  3564. byod: {}
  3565. cluster_env: stress_tests/stress_tests_app_config.yaml
  3566. cluster_compute: stress_tests/placement_group_tests_compute.yaml
  3567. run:
  3568. timeout: 7200
  3569. script: python stress_tests/test_placement_group.py
  3570. variations:
  3571. - __suffix__: aws
  3572. - __suffix__: gce
  3573. env: gce
  3574. frequency: manual
  3575. cluster:
  3576. cluster_env: stress_tests/stress_tests_app_config.yaml
  3577. cluster_compute: stress_tests/placement_group_tests_compute_gce.yaml
  3578. - name: decision_tree_autoscaling_20_runs
  3579. group: core-multi-test
  3580. working_dir: nightly_tests
  3581. frequency: nightly
  3582. python: "3.8"
  3583. team: core
  3584. cluster:
  3585. byod: {}
  3586. cluster_env: decision_tree/decision_tree_app_config.yaml
  3587. cluster_compute: decision_tree/autoscaling_compute.yaml
  3588. run:
  3589. timeout: 9600
  3590. script: python decision_tree/cart_with_tree.py --concurrency=20
  3591. variations:
  3592. - __suffix__: aws
  3593. - __suffix__: gce
  3594. env: gce
  3595. frequency: manual
  3596. cluster:
  3597. cluster_env: decision_tree/decision_tree_app_config.yaml
  3598. cluster_compute: decision_tree/autoscaling_compute_gce.yaml
  3599. - name: autoscaling_shuffle_1tb_1000_partitions
  3600. group: core-multi-test
  3601. working_dir: nightly_tests
  3602. frequency: nightly
  3603. python: "3.8"
  3604. team: core
  3605. cluster:
  3606. byod:
  3607. runtime_env:
  3608. - RAY_worker_killing_policy=retriable_lifo
  3609. cluster_env: shuffle/shuffle_app_config.yaml
  3610. cluster_compute: shuffle/shuffle_compute_autoscaling.yaml
  3611. run:
  3612. timeout: 4000
  3613. script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
  3614. --no-streaming
  3615. variations:
  3616. - __suffix__: aws
  3617. - __suffix__: gce
  3618. env: gce
  3619. frequency: manual
  3620. cluster:
  3621. cluster_env: shuffle/shuffle_app_config.yaml
  3622. cluster_compute: shuffle/shuffle_compute_autoscaling_gce.yaml
  3623. - name: microbenchmark
  3624. group: core-daily-test
  3625. team: core
  3626. frequency: nightly
  3627. working_dir: microbenchmark
  3628. python: "3.7"
  3629. cluster:
  3630. cluster_env: app_config.yaml
  3631. cluster_compute: tpl_64.yaml
  3632. run:
  3633. timeout: 1800
  3634. script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py
  3635. variations:
  3636. - __suffix__: aws
  3637. - __suffix__: gce
  3638. env: gce
  3639. frequency: manual
  3640. cluster:
  3641. cluster_env: app_config.yaml
  3642. cluster_compute: tpl_64_gce.yaml
  3643. - name: microbenchmark_38
  3644. group: core-daily-test
  3645. team: core
  3646. frequency: nightly
  3647. working_dir: microbenchmark
  3648. python: "3.8"
  3649. cluster:
  3650. byod: {}
  3651. cluster_env: app_config.yaml
  3652. cluster_compute: tpl_64.yaml
  3653. run:
  3654. timeout: 1800
  3655. script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py
  3656. variations:
  3657. - __suffix__: aws
  3658. - __suffix__: gce
  3659. env: gce
  3660. frequency: manual
  3661. cluster:
  3662. cluster_env: app_config.yaml
  3663. cluster_compute: tpl_64_gce.yaml
  3664. - name: benchmark_worker_startup
  3665. group: core-daily-test
  3666. team: core
  3667. frequency: nightly
  3668. working_dir: benchmark-worker-startup
  3669. stable: false
  3670. python: "3.9"
  3671. cluster:
  3672. cluster_env: app_config_gpu.yaml
  3673. cluster_compute: only_head_node_1gpu_64cpu.yaml
  3674. run:
  3675. timeout: 7200
  3676. script: python benchmark_worker_startup.py
  3677. --num_cpus_in_cluster 64
  3678. --num_gpus_in_cluster 64
  3679. --num_tasks_or_actors_per_run 64
  3680. --num_measurements_per_configuration 5
  3681. variations:
  3682. - __suffix__: aws
  3683. - __suffix__: gce
  3684. env: gce
  3685. frequency: manual
  3686. cluster:
  3687. cluster_env: app_config_gpu.yaml
  3688. cluster_compute: only_head_node_1gpu_64cpu_gce.yaml
  3689. - name: dask_on_ray_100gb_sort
  3690. group: core-daily-test
  3691. working_dir: nightly_tests
  3692. frequency: nightly
  3693. team: core
  3694. cluster:
  3695. cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
  3696. cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml
  3697. run:
  3698. timeout: 7200
  3699. script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions
  3700. 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
  3701. variations:
  3702. - __suffix__: aws
  3703. - __suffix__: gce
  3704. env: gce
  3705. frequency: manual
  3706. cluster:
  3707. cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
  3708. cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template_gce.yaml
  3709. - name: dask_on_ray_large_scale_test_spilling
  3710. group: core-daily-test
  3711. working_dir: nightly_tests
  3712. frequency: nightly
  3713. team: data
  3714. cluster:
  3715. cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
  3716. cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
  3717. run:
  3718. timeout: 7200
  3719. script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
  3720. 70 --error_rate 0 --data_save_path /tmp/ray
  3721. wait_for_nodes:
  3722. num_nodes: 21
  3723. smoke_test:
  3724. frequency: nightly
  3725. cluster:
  3726. app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
  3727. cluster_compute: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
  3728. run:
  3729. timeout: 7200
  3730. script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb
  3731. 70 --error_rate 0 --data_save_path /tmp/ray
  3732. wait_for_nodes:
  3733. num_nodes: 5
  3734. - name: stress_test_state_api_scale
  3735. group: core-daily-test
  3736. working_dir: nightly_tests
  3737. frequency: nightly
  3738. team: core
  3739. cluster:
  3740. cluster_env: stress_tests/state_api_app_config.yaml
  3741. cluster_compute: stress_tests/stress_tests_compute_large.yaml
  3742. run:
  3743. timeout: 3600
  3744. script: python stress_tests/test_state_api_scale.py
  3745. wait_for_nodes:
  3746. num_nodes: 7
  3747. smoke_test:
  3748. frequency: nightly
  3749. cluster:
  3750. app_config: stress_tests/state_api_app_config.yaml
  3751. cluster_compute: stress_tests/smoke_test_compute.yaml
  3752. run:
  3753. timeout: 3600
  3754. wait_for_nodes:
  3755. num_nodes: 5
  3756. script: python stress_tests/test_state_api_scale.py --smoke-test
  3757. variations:
  3758. - __suffix__: aws
  3759. - __suffix__: gce
  3760. env: gce
  3761. frequency: manual
  3762. cluster:
  3763. cluster_env: stress_tests/state_api_app_config.yaml
  3764. cluster_compute: stress_tests/stress_tests_compute_large_gce.yaml
  3765. smoke_test:
  3766. frequency: manual
  3767. - name: shuffle_20gb_with_state_api
  3768. group: core-daily-test
  3769. working_dir: nightly_tests
  3770. frequency: nightly
  3771. team: core
  3772. cluster:
  3773. cluster_env: shuffle/shuffle_with_state_api_app_config.yaml
  3774. cluster_compute: shuffle/shuffle_compute_single.yaml
  3775. run:
  3776. timeout: 1000
  3777. script: python stress_tests/test_state_api_with_other_tests.py
  3778. nightly_tests/shuffle/shuffle_test.py --test-args="--num-partitions=100 --partition-size=200e6"
  3779. variations:
  3780. - __suffix__: aws
  3781. - __suffix__: gce
  3782. env: gce
  3783. frequency: manual
  3784. cluster:
  3785. cluster_env: shuffle/shuffle_with_state_api_app_config.yaml
  3786. cluster_compute: shuffle/shuffle_compute_single_gce.yaml
  3787. - name: stress_test_many_tasks
  3788. group: core-daily-test
  3789. working_dir: nightly_tests
  3790. frequency: nightly
  3791. team: core
  3792. cluster:
  3793. cluster_env: stress_tests/stress_tests_app_config.yaml
  3794. cluster_compute: stress_tests/stress_tests_compute.yaml
  3795. run:
  3796. timeout: 14400
  3797. wait_for_nodes:
  3798. num_nodes: 101
  3799. script: python stress_tests/test_many_tasks.py
  3800. smoke_test:
  3801. frequency: nightly
  3802. cluster:
  3803. app_config: stress_tests/stress_tests_app_config.yaml
  3804. cluster_compute: stress_tests/smoke_test_compute.yaml
  3805. run:
  3806. timeout: 3600
  3807. wait_for_nodes:
  3808. num_nodes: 5
  3809. script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
  3810. variations:
  3811. - __suffix__: aws
  3812. - __suffix__: gce
  3813. env: gce
  3814. frequency: manual
  3815. cluster:
  3816. cluster_env: stress_tests/stress_tests_app_config.yaml
  3817. cluster_compute: stress_tests/stress_tests_compute_gce.yaml
  3818. smoke_test:
  3819. frequency: manual
  3820. - name: stress_test_dead_actors
  3821. group: core-daily-test
  3822. working_dir: nightly_tests
  3823. frequency: nightly
  3824. team: core
  3825. cluster:
  3826. cluster_env: stress_tests/stress_tests_app_config.yaml
  3827. cluster_compute: stress_tests/stress_tests_compute.yaml
  3828. run:
  3829. timeout: 7200
  3830. wait_for_nodes:
  3831. num_nodes: 101
  3832. script: python stress_tests/test_dead_actors.py
  3833. smoke_test:
  3834. frequency: nightly
  3835. cluster:
  3836. app_config: stress_tests/stress_tests_app_config.yaml
  3837. cluster_compute: stress_tests/smoke_test_compute.yaml
  3838. run:
  3839. timeout: 3600
  3840. wait_for_nodes:
  3841. num_nodes: 5
  3842. script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3
  3843. --num-children=3
  3844. variations:
  3845. - __suffix__: aws
  3846. - __suffix__: gce
  3847. env: gce
  3848. frequency: manual
  3849. cluster:
  3850. cluster_env: stress_tests/stress_tests_app_config.yaml
  3851. cluster_compute: stress_tests/stress_tests_compute_gce.yaml
  3852. smoke_test:
  3853. frequency: manual
  3854. # The full test is not stable, so run the smoke test only.
  3855. # See https://github.com/ray-project/ray/issues/23244.
  3856. - name: threaded_actors_stress_test
  3857. group: core-daily-test
  3858. working_dir: nightly_tests
  3859. frequency: nightly
  3860. team: core
  3861. cluster:
  3862. cluster_env: stress_tests/stress_tests_app_config.yaml
  3863. cluster_compute: stress_tests/smoke_test_compute.yaml
  3864. run:
  3865. timeout: 3600
  3866. script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
  3867. 30
  3868. wait_for_nodes:
  3869. num_nodes: 5
  3870. variations:
  3871. - __suffix__: aws
  3872. - __suffix__: gce
  3873. env: gce
  3874. frequency: manual
  3875. cluster:
  3876. cluster_env: stress_tests/stress_tests_app_config.yaml
  3877. cluster_compute: stress_tests/smoke_test_compute_gce.yaml
  3878. # - name: threaded_actors_stress_test
  3879. # group: core-daily-test
  3880. # working_dir: nightly_tests
  3881. #
  3882. # frequency: nightly
  3883. # team: core
  3884. # cluster:
  3885. # cluster_env: stress_tests/stress_tests_app_config.yaml
  3886. # cluster_compute: stress_tests/stress_test_threaded_actor_compute.yaml
  3887. #
  3888. # run:
  3889. # timeout: 7200
  3890. # script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s
  3891. # 60
  3892. #
  3893. # wait_for_nodes:
  3894. # num_nodes: 201
  3895. # timeout: 600
  3896. #
  3897. # smoke_test:
  3898. # frequency: nightly
  3899. # cluster:
  3900. # app_config: stress_tests/stress_tests_app_config.yaml
  3901. # cluster_compute: stress_tests/smoke_test_compute.yaml
  3902. #
  3903. # run:
  3904. # timeout: 3600
  3905. # script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
  3906. # 30
  3907. #
  3908. # wait_for_nodes:
  3909. # num_nodes: 5
  3910. # timeout: 600
  3911. - name: single_node_oom
  3912. group: core-daily-test
  3913. working_dir: nightly_tests
  3914. frequency: nightly
  3915. team: core
  3916. cluster:
  3917. cluster_env: stress_tests/stress_tests_single_node_oom_app_config.yaml
  3918. cluster_compute: stress_tests/stress_tests_single_node_oom_compute.yaml
  3919. run:
  3920. timeout: 500
  3921. script: python stress_tests/test_parallel_tasks_memory_pressure.py --num-tasks 20
  3922. variations:
  3923. - __suffix__: aws
  3924. - __suffix__: gce
  3925. env: gce
  3926. frequency: manual
  3927. cluster:
  3928. cluster_env: stress_tests/stress_tests_single_node_oom_app_config.yaml
  3929. cluster_compute: stress_tests/stress_tests_single_node_oom_compute_gce.yaml
  3930. - name: tune_air_oom
  3931. group: core-daily-test
  3932. working_dir: air_tests
  3933. stable: false
  3934. jailed: true
  3935. frequency: nightly
  3936. team: core
  3937. cluster:
  3938. cluster_env: oom/stress_tests_tune_air_oom_app_config.yaml
  3939. cluster_compute: oom/stress_tests_tune_air_oom_compute.yaml
  3940. run:
  3941. timeout: 3600
  3942. script: bash oom/tune_air_oom.sh
  3943. - name: dask_on_ray_1tb_sort
  3944. group: core-daily-test
  3945. working_dir: nightly_tests
  3946. frequency: nightly-3x
  3947. team: core
  3948. cluster:
  3949. cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
  3950. cluster_compute: dask_on_ray/1tb_sort_compute.yaml
  3951. run:
  3952. timeout: 7200
  3953. script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions
  3954. 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
  3955. wait_for_nodes:
  3956. num_nodes: 32
  3957. - name: many_nodes_actor_test_on_v2
  3958. group: core-daily-test
  3959. working_dir: benchmarks
  3960. frequency: nightly-3x
  3961. team: core
  3962. cluster:
  3963. cluster_env: distributed/many_nodes_tests/app_config.yaml
  3964. cluster_compute: distributed/many_nodes_tests/compute_config.yaml
  3965. run:
  3966. timeout: 3600
  3967. # 2cpus per node x 1000 nodes / 0.2 cpus per actor = 10k
  3968. # 2cpus per node x 2000 nodes / 0.2 cpus per actor = 20k
  3969. script: python distributed/many_nodes_tests/actor_test.py --no-wait --cpus-per-actor=0.2 --total-actors 10000 20000
  3970. wait_for_nodes:
  3971. num_nodes: 500
  3972. variations:
  3973. - __suffix__: aws
  3974. - __suffix__: gce
  3975. env: gce
  3976. frequency: manual
  3977. cluster:
  3978. cluster_env: distributed/many_nodes_tests/app_config.yaml
  3979. cluster_compute: distributed/many_nodes_tests/compute_config_gce.yaml
  3980. #- name: many_nodes_multi_master_test
  3981. # group: core-daily-test
  3982. # working_dir: nightly_tests
  3983. #
  3984. # frequency: nightly-3x
  3985. # team: core
  3986. # cluster:
  3987. # cluster_env: many_nodes_tests/app_config.yaml
  3988. # cluster_compute: many_nodes_tests/compute_config.yaml
  3989. #
  3990. # run:
  3991. # timeout: 7200
  3992. # script: python many_nodes_tests/multi_master_test.py
  3993. # wait_for_nodes:
  3994. # num_nodes: 251
  3995. #
  3996. - name: pg_autoscaling_regression_test
  3997. group: core-daily-test
  3998. working_dir: nightly_tests
  3999. frequency: nightly
  4000. team: core
  4001. cluster:
  4002. cluster_env: placement_group_tests/app_config.yaml
  4003. cluster_compute: placement_group_tests/compute.yaml
  4004. run:
  4005. timeout: 1200
  4006. script: python placement_group_tests/pg_run.py
  4007. variations:
  4008. - __suffix__: aws
  4009. - __suffix__: gce
  4010. env: gce
  4011. frequency: manual
  4012. cluster:
  4013. cluster_env: placement_group_tests/app_config.yaml
  4014. cluster_compute: placement_group_tests/compute_gce.yaml
  4015. - name: placement_group_performance_test
  4016. group: core-daily-test
  4017. working_dir: nightly_tests
  4018. frequency: nightly
  4019. team: core
  4020. cluster:
  4021. cluster_env: placement_group_tests/app_config.yaml
  4022. cluster_compute: placement_group_tests/pg_perf_test_compute.yaml
  4023. run:
  4024. timeout: 1200
  4025. script: python placement_group_tests/placement_group_performance_test.py
  4026. wait_for_nodes:
  4027. num_nodes: 5
  4028. variations:
  4029. - __suffix__: aws
  4030. - __suffix__: gce
  4031. env: gce
  4032. frequency: manual
  4033. cluster:
  4034. cluster_env: placement_group_tests/app_config.yaml
  4035. cluster_compute: placement_group_tests/pg_perf_test_compute_gce.yaml
  4036. #########################
  4037. # Core Scalability Tests
  4038. #########################
  4039. - name: single_node
  4040. group: core-scalability-test
  4041. working_dir: benchmarks
  4042. frequency: nightly
  4043. team: core
  4044. cluster:
  4045. cluster_env: app_config.yaml
  4046. cluster_compute: single_node.yaml
  4047. run:
  4048. timeout: 12000
  4049. prepare: sleep 0
  4050. script: python single_node/test_single_node.py
  4051. variations:
  4052. - __suffix__: aws
  4053. - __suffix__: gce
  4054. env: gce
  4055. frequency: manual
  4056. cluster:
  4057. cluster_env: app_config.yaml
  4058. cluster_compute: single_node_gce.yaml
  4059. - name: object_store
  4060. group: core-scalability-test
  4061. working_dir: benchmarks
  4062. frequency: nightly
  4063. team: core
  4064. cluster:
  4065. cluster_env: app_config.yaml
  4066. cluster_compute: object_store.yaml
  4067. run:
  4068. timeout: 3600
  4069. script: python object_store/test_object_store.py
  4070. wait_for_nodes:
  4071. num_nodes: 50
  4072. variations:
  4073. - __suffix__: aws
  4074. - __suffix__: gce
  4075. env: gce
  4076. frequency: manual
  4077. cluster:
  4078. cluster_env: app_config.yaml
  4079. cluster_compute: object_store_gce.yaml
  4080. - name: many_actors
  4081. group: core-scalability-test
  4082. working_dir: benchmarks
  4083. frequency: nightly-3x
  4084. team: core
  4085. cluster:
  4086. cluster_env: app_config.yaml
  4087. cluster_compute: distributed.yaml
  4088. run:
  4089. timeout: 3600
  4090. script: python distributed/test_many_actors.py
  4091. wait_for_nodes:
  4092. num_nodes: 65
  4093. variations:
  4094. - __suffix__: aws
  4095. - __suffix__: gce
  4096. env: gce
  4097. frequency: manual
  4098. cluster:
  4099. cluster_env: app_config.yaml
  4100. cluster_compute: distributed_gce.yaml
  # Smoke-test run of the many-actors script (SMOKE_TEST=1) on the
  # 2-node smoke-test compute config. No per-cloud variations.
  - name: many_actors_smoke_test
    group: core-scalability-test
    working_dir: benchmarks
    frequency: nightly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed_smoke_test.yaml
    run:
      timeout: 3600
      script: SMOKE_TEST=1 python distributed/test_many_actors.py
      wait_for_nodes:
        num_nodes: 2
  # Many-tasks benchmark (--num-tasks=10000) on the 65-node distributed
  # cluster. Nightly on AWS, manual-only on GCE.
  - name: many_tasks
    group: core-scalability-test
    working_dir: benchmarks
    frequency: nightly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed.yaml
    run:
      timeout: 3600
      script: python distributed/test_many_tasks.py --num-tasks=10000
      wait_for_nodes:
        num_nodes: 65
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: distributed_gce.yaml
  # Many-placement-groups benchmark on the 65-node distributed cluster.
  # Scheduled nightly-3x on AWS; the GCE variation is manual-only.
  - name: many_pgs
    group: core-scalability-test
    working_dir: benchmarks
    frequency: nightly-3x
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed.yaml
    run:
      timeout: 3600
      script: python distributed/test_many_pgs.py
      wait_for_nodes:
        num_nodes: 65
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: distributed_gce.yaml
  # Smoke-test run of the many-pgs script (SMOKE_TEST=1) on the 2-node
  # smoke-test compute config. No per-cloud variations.
  - name: many_pgs_smoke_test
    group: core-scalability-test
    working_dir: benchmarks
    frequency: nightly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed_smoke_test.yaml
    run:
      timeout: 3600
      script: SMOKE_TEST=1 python distributed/test_many_pgs.py
      wait_for_nodes:
        num_nodes: 2
  # Runs the many-tasks script with --num-tasks=1000 after waiting for
  # 250 nodes. Nightly-3x on AWS, manual-only on GCE.
  - name: many_nodes
    group: core-scalability-test
    working_dir: benchmarks
    frequency: nightly-3x
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: many_nodes.yaml
    run:
      timeout: 3600
      script: python distributed/test_many_tasks.py --num-tasks=1000
      wait_for_nodes:
        num_nodes: 250
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: many_nodes_gce.yaml
  # Scheduling benchmark: zero-duration tasks with 32 actors, one per
  # node, after waiting for 32 nodes. Nightly on AWS, manual-only on GCE.
  - name: scheduling_test_many_0s_tasks_many_nodes
    group: core-scalability-test
    working_dir: benchmarks
    frequency: nightly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: scheduling.yaml
    run:
      timeout: 3600
      # Folded scalar: the lines below are joined with single spaces
      # into one command line, with no trailing newline.
      script: >-
        python distributed/test_scheduling.py
        --total-num-task=1984000
        --num-cpu-per-task=1
        --task-duration-s=0
        --total-num-actors=32
        --num-actors-per-nodes=1
      wait_for_nodes:
        num_nodes: 32
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: scheduling_gce.yaml
  4212. # - name: scheduling_test_many_5s_tasks_single_node
  4213. # group: core-scalability-test
  4214. # working_dir: benchmarks
  4215. # frequency: nightly
  4216. # team: core
  4217. # cluster:
  4218. # cluster_env: app_config.yaml
  4219. # cluster_compute: scheduling.yaml
  4220. # run:
  4221. # timeout: 3600
  4222. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  4223. # --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  4224. # wait_for_nodes:
  4225. # num_nodes: 32
  4226. # timeout: 600
  4227. # stable: false
  4228. # - name: scheduling_test_many_5s_tasks_many_nodes
  4229. # group: core-scalability-test
  4230. # working_dir: benchmarks
  4231. # frequency: nightly
  4232. # team: core
  4233. # cluster:
  4234. # cluster_env: app_config.yaml
  4235. # cluster_compute: scheduling.yaml
  4236. # run:
  4237. # timeout: 3600
  4238. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  4239. # --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  4240. # wait_for_nodes:
  4241. # num_nodes: 32
  4242. # timeout: 600
  4243. # stable: false
  4244. ###############
  4245. # Dataset tests
  4246. ###############
  # Dataset inference test: runs inference.py once 2 nodes are up.
  # Nightly on AWS, manual-only on GCE.
  - name: inference
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: inference.yaml
    run:
      timeout: 600
      script: python inference.py
      wait_for_nodes:
        num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: inference_gce.yaml
  # Shuffle data loader test. The GCE variation overrides only the
  # compute config and the script's --cloud flag (aws -> gcp).
  - name: shuffle_data_loader
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle_app_config.yaml
      cluster_compute: shuffle_compute.yaml
    run:
      timeout: 1800
      script: python dataset_shuffle_data_loader.py --cloud aws
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: shuffle_compute_gce.yaml
        run:
          script: python dataset_shuffle_data_loader.py --cloud gcp
  # Parquet metadata resolution over 915 files on a single node. The
  # GCE variation overrides the compute config and the --cloud flag.
  - name: parquet_metadata_resolution
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The test is expected to take roughly 40 seconds.
      timeout: 100
      script: python parquet_metadata_resolution.py --num-files 915 --cloud aws
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: single_node_benchmark_compute_gce.yaml
        run:
          script: python parquet_metadata_resolution.py --num-files 915 --cloud gcp
  # Dataset random access test, marked unstable (stable: false). It
  # uses the pipelined_training env/compute and waits for 15 nodes.
  - name: dataset_random_access
    group: data-tests
    working_dir: nightly_tests/dataset
    stable: false
    frequency: nightly
    team: data
    cluster:
      cluster_env: pipelined_training_app.yaml
      cluster_compute: pipelined_training_compute.yaml
    run:
      timeout: 1200
      script: python dataset_random_access.py
      wait_for_nodes:
        num_nodes: 15
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: pipelined_training_app.yaml
          cluster_compute: pipelined_training_compute_gce.yaml
  # Data ingest benchmark with --streaming: 1000 GB dataset, 20 workers,
  # waits for 20 nodes. Nightly on AWS, manual-only on GCE.
  - name: pipelined_data_ingest_benchmark_1tb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: data_ingest_benchmark_compute.yaml
    run:
      timeout: 300
      script: >-
        python data_ingest_benchmark.py
        --dataset-size-gb=1000
        --num-workers=20
        --streaming
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: data_ingest_benchmark_compute_gce.yaml
  # Data ingest benchmark with --new_streaming: 1000 GB dataset,
  # 20 workers, waits for 20 nodes. Nightly on AWS, manual-only on GCE.
  - name: streaming_data_ingest_benchmark_1tb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: data_ingest_benchmark_compute.yaml
    run:
      timeout: 300
      script: >-
        python data_ingest_benchmark.py
        --dataset-size-gb=1000
        --num-workers=20
        --new_streaming
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: data_ingest_benchmark_compute_gce.yaml
  # GPU data ingest benchmark (--use-gpu): 100 GB dataset, 4 workers,
  # waits for 3 nodes. Nightly on AWS, manual-only on GCE.
  - name: streaming_data_ingest_benchmark_100gb_gpu
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: data_ingest_benchmark_compute_gpu.yaml
    run:
      timeout: 300
      script: >-
        python data_ingest_benchmark.py
        --dataset-size-gb=100
        --num-workers=4
        --new_streaming
        --use-gpu
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
  4394. # This test case will early stop the data ingestion iteration on the GPU actors.
  4395. # This is a common usage in PyTorch Lightning
  4396. # (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches).
  # There was a bug in Ray Data that caused a GPU memory leak (see #34819).
  4398. # We add this test case to cover this scenario.
  # Same command line as the 100 GB GPU ingest benchmark, with
  # --early-stop added.
  - name: streaming_data_ingest_benchmark_100gb_gpu_early_stop
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: data_ingest_benchmark_compute_gpu.yaml
    run:
      timeout: 300
      script: >-
        python data_ingest_benchmark.py
        --dataset-size-gb=100
        --num-workers=4
        --new_streaming
        --use-gpu
        --early-stop
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
  # Aggregate benchmark on the single-node benchmark compute config.
  # Nightly on AWS, manual-only on GCE.
  - name: aggregate_benchmark
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      timeout: 1800
      script: python aggregate_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # Parquet read benchmark on the single-node benchmark compute config.
  # Nightly on AWS, manual-only on GCE.
  - name: read_parquet_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to finish within 400 seconds.
      timeout: 400
      script: python read_parquet_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # Image read benchmark on the single-node benchmark compute config.
  # Nightly on AWS, manual-only on GCE.
  - name: read_images_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      timeout: 1800
      script: python read_images_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # TFRecords read benchmark. It uses its own cluster env
  # (read_tfrecords_benchmark_app.yaml) on both clouds.
  - name: read_tfrecords_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: read_tfrecords_benchmark_app.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to take roughly 22 minutes.
      timeout: 1800
      script: python read_tfrecords_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: read_tfrecords_benchmark_app.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # map_batches benchmark on the single-node benchmark compute config.
  # Nightly on AWS, manual-only on GCE.
  - name: map_batches_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to take roughly 30 minutes.
      timeout: 2400
      script: python map_batches_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # iter_tensor_batches benchmark with default script arguments on the
  # single-node benchmark compute config.
  - name: iter_tensor_batches_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to take roughly 30 minutes.
      timeout: 2400
      script: python iter_tensor_batches_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # iter_tensor_batches benchmark with --data-size-gb=10 on the
  # multi-node benchmark compute config.
  - name: iter_tensor_batches_benchmark_multi_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: multi_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to take roughly 30 minutes.
      timeout: 2400
      script: python iter_tensor_batches_benchmark.py --data-size-gb=10
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: multi_node_benchmark_compute_gce.yaml
  # iter_batches benchmark on the single-node benchmark compute config.
  # Nightly on AWS, manual-only on GCE.
  - name: iter_batches_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to take roughly 12 minutes.
      timeout: 1080
      script: python iter_batches_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # Pipelined training for one epoch; the run waits for 15 nodes.
  # Nightly on AWS, manual-only on GCE.
  - name: pipelined_training_50_gb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: pipelined_training_app.yaml
      cluster_compute: pipelined_training_compute.yaml
    run:
      timeout: 4800
      script: python pipelined_training.py --epochs 1
      wait_for_nodes:
        num_nodes: 15
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: pipelined_training_app.yaml
          cluster_compute: pipelined_training_compute_gce.yaml
  # Pipelined ingestion: runs pipelined_training.py for 2 epochs over
  # 5 windows and 915 files with --debug, after waiting for 21 nodes.
  # Nightly on AWS, manual-only on GCE.
  - name: pipelined_ingestion_1500_gb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: pipelined_ingestion_app.yaml
      cluster_compute: pipelined_ingestion_compute.yaml
    run:
      timeout: 9600
      script: >-
        python pipelined_training.py
        --epochs 2
        --num-windows 5
        --num-files 915
        --debug
      wait_for_nodes:
        num_nodes: 21
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          # Use the ingestion env/compute, matching the base config
          # above and the files chaos_pipelined_ingestion_1500_gb_15_windows
          # uses on GCE. This previously pointed at the pipelined_training
          # files, which other tests here pair with a 15-node wait, while
          # this test waits for 21 nodes.
          # NOTE(review): confirm pipelined_ingestion_compute_gce.yaml
          # provides at least 21 nodes.
          cluster_env: pipelined_ingestion_app.yaml
          cluster_compute: pipelined_ingestion_compute_gce.yaml
  # sort.py with --shuffle: 1000 partitions of size 1e9, after waiting
  # for 20 nodes. Nightly on AWS, manual-only on GCE.
  - name: dataset_shuffle_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
        --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # sort.py without --shuffle: 1000 partitions of size 1e9, after
  # waiting for 20 nodes. Marked unstable (stable: false).
  - name: dataset_shuffle_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # sort.py with --shuffle and RAY_DATA_PUSH_BASED_SHUFFLE=1 set in
  # the command's environment; the run waits for 20 nodes.
  - name: dataset_shuffle_push_based_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
        --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # sort.py without --shuffle and with RAY_DATA_PUSH_BASED_SHUFFLE=1
  # set in the command's environment; the run waits for 20 nodes.
  - name: dataset_shuffle_push_based_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Weekly push-based random shuffle with 100000 partitions of size 1e9
  # on 100 nodes. The manual-only GCE variation overrides the whole run
  # section and uses 40000 partitions instead.
  - name: dataset_shuffle_push_based_random_shuffle_100tb
    group: data-tests
    working_dir: nightly_tests
    frequency: weekly
    team: data
    cluster:
      cluster_env: shuffle/100tb_shuffle_app_config.yaml
      cluster_compute: shuffle/100tb_shuffle_compute.yaml
    run:
      timeout: 28800
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1
        python dataset/sort.py
        --num-partitions=100000
        --partition-size=1e9
        --shuffle
      wait_for_nodes:
        num_nodes: 100
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/100tb_shuffle_app_config_gce.yaml
          cluster_compute: shuffle/100tb_shuffle_compute_gce.yaml
        run:
          timeout: 28800
          script: >-
            RAY_DATA_PUSH_BASED_SHUFFLE=1
            python dataset/sort.py
            --num-partitions=40000
            --partition-size=1e9
            --shuffle
          wait_for_nodes:
            num_nodes: 100
  4732. ##################
  4733. # Core Chaos tests
  4734. ##################
  # Chaos test with the "tasks" workload on 10 nodes. The prepare step
  # runs setup_chaos.py with --no-start before the test script.
  - name: chaos_many_tasks_no_object_store
    group: core-nightly-test
    working_dir: nightly_tests
    frequency: nightly
    team: core
    cluster:
      cluster_env: chaos_test/app_config.yaml
      cluster_compute: chaos_test/compute_template.yaml
    run:
      timeout: 3600
      wait_for_nodes:
        num_nodes: 10
      prepare: python setup_chaos.py --no-start
      script: python chaos_test/test_chaos_basic.py --workload=tasks
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/app_config.yaml
          cluster_compute: chaos_test/compute_template_gce.yaml
  # Chaos test with the "actors" workload on 10 nodes. The prepare step
  # runs setup_chaos.py with --no-start before the test script.
  - name: chaos_many_actors
    group: core-nightly-test
    working_dir: nightly_tests
    frequency: nightly
    team: core
    cluster:
      cluster_env: chaos_test/app_config.yaml
      cluster_compute: chaos_test/compute_template.yaml
    run:
      timeout: 4200
      wait_for_nodes:
        num_nodes: 10
      prepare: python setup_chaos.py --no-start
      script: python chaos_test/test_chaos_basic.py --workload=actors
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/app_config.yaml
          cluster_compute: chaos_test/compute_template_gce.yaml
  # Dask-on-Ray large scale test under chaos: 20 workers with a 20 GB
  # object store each, on 21 nodes. setup_chaos.py is given
  # --node-kill-interval 100 (units are not shown here).
  - name: chaos_dask_on_ray_large_scale_test_no_spilling
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
      cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml
    run:
      timeout: 7200
      wait_for_nodes:
        num_nodes: 21
      prepare: python setup_chaos.py --node-kill-interval 100
      script: >-
        python dask_on_ray/large_scale_test.py
        --num_workers 20
        --worker_obj_store_size_in_gb 20
        --error_rate 0
        --data_save_path /tmp/ray
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
          # NOTE(review): the AWS compute file is chaos-specific
          # (chaos_dask_on_ray_stress_compute.yaml) but this GCE one is
          # not. Confirm that is intended.
          cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  # Dask-on-Ray large scale test under chaos: 150 workers with a 70 GB
  # object store each, on 21 nodes. setup_chaos.py is given
  # --node-kill-interval 100 (units are not shown here).
  - name: chaos_dask_on_ray_large_scale_test_spilling
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
      cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
    run:
      timeout: 7200
      wait_for_nodes:
        num_nodes: 21
      prepare: python setup_chaos.py --node-kill-interval 100
      script: >-
        python dask_on_ray/large_scale_test.py
        --num_workers 150
        --worker_obj_store_size_in_gb 70
        --error_rate 0
        --data_save_path /tmp/ray
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
          cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  # Pipelined ingestion under chaos: 1 epoch, 15 windows, 915 files,
  # on 21 nodes. Marked unstable (stable: false).
  - name: chaos_pipelined_ingestion_1500_gb_15_windows
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      cluster_env: dataset/pipelined_ingestion_app.yaml
      cluster_compute: dataset/pipelined_ingestion_compute.yaml
    run:
      timeout: 7200
      wait_for_nodes:
        num_nodes: 21
      # The leading space inside the quotes is part of the original value.
      prepare: ' python setup_chaos.py --node-kill-interval 300'
      script: >-
        python dataset/pipelined_training.py
        --epochs 1
        --num-windows 15
        --num-files 915
        --debug
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: dataset/pipelined_ingestion_app.yaml
          cluster_compute: dataset/pipelined_ingestion_compute_gce.yaml
  # Push-based sort under chaos on 20 nodes. setup_chaos.py is given
  # --node-kill-interval 1200 and --max-nodes-to-kill 3.
  # Marked unstable (stable: false).
  - name: chaos_dataset_shuffle_push_based_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      # The leading space inside the quotes is part of the original value.
      prepare: ' python setup_chaos.py --node-kill-interval 1200 --max-nodes-to-kill 3'
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Sort under chaos on 20 nodes, using the oom_disabled cluster env.
  # setup_chaos.py is given --node-kill-interval 900 and
  # --max-nodes-to-kill 3. Marked unstable (stable: false).
  - name: chaos_dataset_shuffle_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      prepare: 'python setup_chaos.py --node-kill-interval 900 --max-nodes-to-kill 3'
      script: >-
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Random shuffle under chaos on 20 nodes. setup_chaos.py is given
  # --node-kill-interval 600 and --max-nodes-to-kill 2.
  # Marked unstable (stable: false).
  - name: chaos_dataset_shuffle_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      # OOM stays disabled because the test is currently marked unstable.
      cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      # The leading space inside the quotes is part of the original value.
      prepare: ' python setup_chaos.py --node-kill-interval 600 --max-nodes-to-kill 2'
      script: >-
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
        --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Push-based random shuffle under chaos on 20 nodes. setup_chaos.py
  # is given --node-kill-interval 600 and --max-nodes-to-kill 2.
  # Marked unstable (stable: false).
  - name: chaos_dataset_shuffle_push_based_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      # OOM stays disabled because the test is currently marked unstable.
      cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      # The leading space inside the quotes is part of the original value.
      prepare: ' python setup_chaos.py --node-kill-interval 600 --max-nodes-to-kill 2'
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1
        python dataset/sort.py
        --num-partitions=1000
        --partition-size=1e9
        --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  4943. #####################
  4944. # Observability tests
  4945. #####################
  # Agent stress test: runs mem_check.py with the working directory
  # set to ".". Nightly on AWS, manual-only on GCE.
  - name: agent_stress_test
    group: core-observability-test
    working_dir: dashboard
    stable: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: agent_stress_app_config.yaml
      cluster_compute: agent_stress_compute.yaml
    run:
      timeout: 14400
      script: python mem_check.py --working-dir .
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: agent_stress_app_config.yaml
          cluster_compute: agent_stress_compute_gce.yaml
  # Serve HA test on Kubernetes: prepare.sh runs first, then
  # run_gcs_ft_on_k8s.py. Marked unstable (stable: false).
  - name: k8s_serve_ha_test
    group: k8s-test
    working_dir: k8s_tests
    stable: false
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl.yaml
    run:
      timeout: 28800  # 8h
      prepare: bash prepare.sh
      script: python run_gcs_ft_on_k8s.py
  # Cluster launcher test: runs launch_and_verify_cluster.py against
  # aws/tests/aws_cluster.yaml.
  - name: aws_cluster_launcher
    group: cluster-launcher-test
    working_dir: ../python/ray/autoscaler/
    stable: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: aws/tests/aws_config.yaml
      cluster_compute: aws/tests/aws_compute.yaml
    run:
      timeout: 1200
      script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml
  # Cluster launcher test: runs launch_and_verify_cluster.py against
  # aws/example-minimal.yaml.
  - name: aws_cluster_launcher_minimal
    group: cluster-launcher-test
    working_dir: ../python/ray/autoscaler/
    stable: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: aws/tests/aws_config.yaml
      cluster_compute: aws/tests/aws_compute.yaml
    run:
      timeout: 1200
      script: python launch_and_verify_cluster.py aws/example-minimal.yaml
  # Cluster launcher test: runs launch_and_verify_cluster.py against
  # aws/example-full.yaml.
  - name: aws_cluster_launcher_full
    group: cluster-launcher-test
    working_dir: ../python/ray/autoscaler/
    stable: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: aws/tests/aws_config.yaml
      cluster_compute: aws/tests/aws_compute.yaml
    run:
      timeout: 1200
      script: python launch_and_verify_cluster.py aws/example-full.yaml
  # Cluster launcher test with env: gce: runs
  # launch_and_verify_cluster.py against gcp/example-minimal.yaml.
  - name: gcp_cluster_launcher_minimal
    group: cluster-launcher-test
    working_dir: ../python/ray/autoscaler/
    stable: true
    env: gce
    frequency: nightly
    team: core
    cluster:
      cluster_env: gcp/tests/gce_config.yaml
      cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
    run:
      timeout: 1200
      script: python launch_and_verify_cluster.py gcp/example-minimal.yaml
  5028. - name: gcp_cluster_launcher_full
  5029. group: cluster-launcher-test
  5030. working_dir: ../python/ray/autoscaler/
  5031. stable: true
  5032. env: gce
  5033. frequency: nightly
  5034. team: core
  5035. cluster:
  5036. cluster_env: gcp/tests/gce_config.yaml
  5037. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  5038. run:
  5039. timeout: 2400
  5040. script: python launch_and_verify_cluster.py gcp/example-full.yaml