release_tests.yaml 131 KB


  # Global release test configuration file.
  # All your release test configuration should go here. Adding release tests here
  # will automatically enable them in the Buildkite release testing schedules
  # (unless they have frequency: manual).
  # Here is an example configuration for reference:
  #- name: example_test
  #  # Tests with the same group will be grouped in the Buildkite UI
  #  group: Example group
  #  # Provide the working directory which will be uploaded to the cluster
  #  working_dir: example_dir
  #
  #  # How often to run the tests.
  #  # One of [manual, any, multi, nightly, nightly-3x, weekly].
  #  # Descriptions of each frequency (that's not immediately obvious):
  #  # - manual: Not run on a schedule, but can be manually run through the buildkite UI.
  #  # - nightly-3x: Run 3 times a week (Monday, Wednesday, Friday).
  #  frequency: weekly
  #  # Owning team. This field will be persisted to the database
  #  team: ml
  #
  #  # Python version. This optional field determines which Python version to run tests
  #  # on. This must be a string!
  #  python: "3.7"
  #
  #  # Optional location of a bash setup script to run on the driver
  #  # when setting up the local environment. Relative to working_dir
  #  driver_setup: setup_driver.sh
  #
  #  # Cluster information
  #  cluster:
  #    # Location of cluster env, relative to working_dir
  #    cluster_env: cluster_env.yaml
  #    # Location of cluster compute, relative to working_dir
  #    cluster_compute: cluster_compute.yaml
  #    # Autosuspend parameter passed to the cluster.
  #    # The cluster will automatically terminate if inactive for this
  #    # many minutes. Defaults to 10 if not set.
  #    autosuspend_mins: 10
  #    # Optional cloud_id to use instead of the default cloud
  #    cloud_id: cld_12345678
  #    # Alternatively, you can specify a cloud name
  #    cloud_name: anyscale_default_cloud
  #
  #  # Run configuration for the test
  #  run:
  #    # Type of test. Can be [anyscale_job]
  #    # Uses either Ray jobs, anyscale jobs or anyscale SDK commands
  #    # to run the actual release test.
  #    type: anyscale_job
  #
  #    # If you want to wait for nodes to be ready, you can specify this here:
  #    wait_for_nodes:
  #      # Number of nodes
  #      num_nodes: 16
  #      # Timeout for waiting for nodes. If nodes are not up by then, the
  #      # test will fail.
  #      timeout: 600
  #
  #    # Optional prepare script to be run on the cluster before the test script
  #    prepare: python prepare.py
  #    # The prepare command can have a separate timeout
  #    prepare_timeout: 300
  #
  #    # Main script to run as the test script
  #    script: python workloads/train_small.py
  #    # Timeout in seconds. After this time the test is considered as failed.
  #    timeout: 600
  #
  #  # You can specify smoke test definitions here. If a smoke test is triggered,
  #  # it will deep update the main test configuration with the values provided
  #  # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
  #  # environment variable and receive the --smoke-test flag as a parameter in the
  #  # run script.
  #  smoke_test:
  #    # Smoke tests can have different frequencies. A smoke test is only triggered
  #    # when the regular test is not matched.
  #    frequency: nightly
  #    # Here we adjust the run timeout down and run on fewer nodes. The test script
  #    # remains the same.
  #    run:
  #      timeout: 300
  #      wait_for_nodes:
  #        num_nodes: 4
  #        timeout: 600
  #
  #  # After the test has finished, this handler (in alerts/) will process the results.
  #  # It can then let the test fail, e.g. if a metric regression is observed.
  #  alert: default
  #######################
  # Cluster scaling tests
  #######################
  # Runs workloads/tune_scale_up_down.py nightly. The gce variation overrides
  # env and the cluster files and is only run manually.
  - name: cluster_tune_scale_up_down
    group: Cluster tests
    working_dir: cluster_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: cpt_autoscaling_1-3_aws.yaml
    run:
      timeout: 3600
      script: python workloads/tune_scale_up_down.py
      wait_for_nodes:
        num_nodes: 0
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: cpt_autoscaling_1-3_gce.yaml
    alert: default
  #########################
  # AIR release tests
  #########################
  # Runs frequent_pausing/script.py three times a week as a long-running test.
  # The gce variation is only run manually.
  - name: tune_with_frequent_pausing
    group: AIR tests
    working_dir: air_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: frequent_pausing/app_config.yaml
      cluster_compute: frequent_pausing/compute_config_aws.yaml
    run:
      timeout: 600  # 10min
      long_running: true
      script: python frequent_pausing/script.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: frequent_pausing/app_config.yaml
          cluster_compute: frequent_pausing/compute_config_gce.yaml
    alert: default
  # Weekly long-running Horovod + Tune workload on 2 nodes. The smoke test is
  # manual-only and shortens the run timeout.
  - name: long_running_horovod_tune_test
    group: AIR tests
    working_dir: air_tests
    frequency: weekly
    team: ml
    cluster:
      cluster_env: horovod/app_config_master.yaml
      cluster_compute: horovod/compute_tpl_aws.yaml
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: horovod/app_config_master.yaml
          cluster_compute: horovod/compute_tpl_gce.yaml
    run:
      timeout: 36000  # 10h
      script: python horovod/workloads/horovod_tune_test.py
      long_running: true
      wait_for_nodes:
        num_nodes: 2
    smoke_test:
      frequency: manual
      run:
        timeout: 3600
    alert: default
  # Nightly data ingest benchmark: 200 GB dataset, 20 workers, waits for 20 nodes.
  # The gce variation is only run manually.
  - name: air_benchmark_data_bulk_ingest
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_data_20_nodes_aws.yaml
    run:
      timeout: 3600
      script: python workloads/data_benchmark.py --dataset-size-gb=200 --num-workers=20
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_data_20_nodes_gce.yaml
    alert: default
  # AIR benchmarks for XGBoost CUJ
  # Nightly run of workloads/xgboost_benchmark.py after waiting for 11 nodes.
  # The smoke test is manual-only and shortens the run timeout.
  - name: air_benchmark_xgboost_cpu_10
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: xgboost_app_config.yaml
      cluster_compute: compute_xgboost_aws.yaml
    run:
      timeout: 36000  # 10h
      script: python workloads/xgboost_benchmark.py
      wait_for_nodes:
        num_nodes: 11
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: xgboost_app_config.yaml
          cluster_compute: compute_xgboost_gce.yaml
    smoke_test:
      frequency: manual
      run:
        timeout: 1800
    alert: default
  # Ray AIR distributed Torch benchmarks
  # Nightly torch_benchmark.py run: 4 workers with 8 CPUs each, on 4 nodes.
  - name: air_benchmark_torch_mnist_cpu_4x1
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_cpu_4_aws.yaml
    run:
      timeout: 3600
      script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_cpu_4_gce.yaml
    alert: default
  # Weekly GPU torch benchmark (16 workers on 4 nodes). The nightly smoke test
  # swaps in a smaller compute config and a shorter script invocation.
  - name: air_benchmark_torch_mnist_gpu_4x4
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: weekly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_gpu_4x4_aws.yaml
    run:
      timeout: 4800
      script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 120 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
      wait_for_nodes:
        num_nodes: 4
    smoke_test:
      frequency: nightly
      cluster:
        cluster_compute: compute_gpu_2x2_aws.yaml
      run:
        timeout: 3600
        script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
        wait_for_nodes:
          num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_gpu_4x4_gce.yaml
        # The smoke test is also manual-only for the gce variation.
        smoke_test:
          frequency: manual
    alert: default
  # Nightly torch_benchmark.py run: 4 workers with 2 CPUs each. No
  # wait_for_nodes is configured for this test.
  - name: air_benchmark_torch_mnist_cpu_1x4
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_cpu_1_aws.yaml
    run:
      timeout: 3600
      script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_cpu_1_gce.yaml
    alert: default
  # Nightly GPU batch prediction benchmark on 20 GB of data.
  # The gce variation is only run manually.
  - name: air_benchmark_torch_batch_prediction_gpu_1x1_20gb
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_gpu_1_cpu_16_aws.yaml
    run:
      timeout: 3600
      script: python workloads/gpu_batch_prediction.py --data-size-gb 20
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_gpu_1_cpu_16_gce.yaml
  # Nightly GPU batch prediction benchmark on 100 GB of data, on 4 nodes.
  # Marked stable: false.
  - name: air_benchmark_torch_batch_prediction_gpu_4x4_100gb
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    stable: false
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_gpu_4x4_aws.yaml
    run:
      timeout: 10800  # 3h
      script: python workloads/gpu_batch_prediction.py --data-size-gb 100
      wait_for_nodes:
        num_nodes: 4
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_gpu_4x4_gce.yaml
  # Nightly torch_benchmark.py run: 16 workers with 2 CPUs each, on 4 nodes.
  - name: air_benchmark_torch_mnist_cpu_4x4
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_cpu_4_aws.yaml
    run:
      timeout: 5400
      script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_cpu_4_gce.yaml
    alert: default
  # Nightly tune_torch_benchmark.py run: 8 trials with 4 workers each, on 8 nodes.
  - name: air_benchmark_tune_torch_mnist
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_cpu_8_aws.yaml
    run:
      timeout: 3600
      script: python workloads/tune_torch_benchmark.py --num-runs 3 --num-trials 8 --num-workers 4
      wait_for_nodes:
        num_nodes: 8
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_cpu_8_gce.yaml
    alert: default
  # Nightly GPU tune_torch_benchmark.py run: 4 trials with 4 workers each, on 4 nodes.
  - name: air_benchmark_tune_torch_mnist_gpu
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_gpu_4x4_aws.yaml
    run:
      timeout: 3600
      script: python workloads/tune_torch_benchmark.py --num-runs 2 --num-trials 4 --num-workers 4 --use-gpu
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_gpu_4x4_gce.yaml
    alert: default
  # Ray AIR distributed Tensorflow benchmarks
  # Nightly tensorflow_benchmark.py run: 4 workers with 8 CPUs each, on 4 nodes.
  - name: air_benchmark_tensorflow_mnist_cpu_4x1
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_cpu_4_aws.yaml
    run:
      timeout: 5400
      script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_cpu_4_gce.yaml
    alert: default
  # Nightly tensorflow_benchmark.py run: 4 workers with 2 CPUs each. No
  # wait_for_nodes is configured for this test.
  - name: air_benchmark_tensorflow_mnist_cpu_1x4
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_cpu_1_aws.yaml
    run:
      timeout: 5400
      script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_cpu_1_gce.yaml
    alert: default
  # Nightly tensorflow_benchmark.py run: 16 workers with 2 CPUs each, on 4 nodes.
  # Marked stable: false.
  - name: air_benchmark_tensorflow_mnist_cpu_4x4
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    stable: false
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_cpu_4_aws.yaml
    run:
      timeout: 5400
      script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_cpu_4_gce.yaml
    alert: default
  # Weekly GPU tensorflow benchmark (16 workers on 4 nodes), marked
  # stable: false. The nightly smoke test swaps in a smaller compute config and
  # a shorter script invocation; it does not override the run timeout.
  - name: air_benchmark_tensorflow_mnist_gpu_4x4
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: weekly
    team: ml
    stable: false
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_gpu_4x4_aws.yaml
    run:
      timeout: 5400
      script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
      wait_for_nodes:
        num_nodes: 4
    smoke_test:
      frequency: nightly
      cluster:
        cluster_compute: compute_gpu_2x2_aws.yaml
      run:
        script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
        wait_for_nodes:
          num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_gpu_4x4_gce.yaml
        # The smoke test is also manual-only for the gce variation.
        smoke_test:
          frequency: manual
    alert: default
  # Nightly end-to-end PyTorch training benchmark on 20 GB of data.
  # The gce variation is only run manually.
  - name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_gpu_1_aws.yaml
    run:
      timeout: 3600
      script: python workloads/pytorch_training_e2e.py --data-size-gb 20
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_gpu_1_gce.yaml
  # Nightly end-to-end PyTorch training benchmark on 100 GB of data with 16
  # workers, on 4 nodes. Marked stable: false.
  - name: air_benchmark_pytorch_training_e2e_gpu_4x4_100gb
    group: AIR tests
    working_dir: air_tests/air_benchmarks
    frequency: nightly
    team: ml
    stable: false
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_gpu_4x4_aws.yaml
    run:
      timeout: 10800  # 3h
      script: python workloads/pytorch_training_e2e.py --data-size-gb=100 --num-workers=16
      wait_for_nodes:
        num_nodes: 4
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_gpu_4x4_gce.yaml
  # Test tiny, medium, and huge input files.
  # Nightly run of file_size_benchmark.sh; marked stable: false and jailed: true.
  - name: ray-data-bulk-ingest-file-size-benchmark
    group: AIR tests
    working_dir: air_tests/air_benchmarks/mlperf-train
    stable: false
    jailed: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: app_config_oom.yaml
      cluster_compute: compute_cpu_16.yaml
    run:
      timeout: 3600
      script: bash file_size_benchmark.sh
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_oom.yaml
          cluster_compute: compute_gce_cpu_16.yaml
  # Test dataset larger than object store memory.
  # Nightly run of out_of_core_benchmark.sh; marked stable: false and jailed: true.
  - name: ray-data-bulk-ingest-out-of-core-benchmark
    group: AIR tests
    working_dir: air_tests/air_benchmarks/mlperf-train
    stable: false
    jailed: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: app_config_oom.yaml
      cluster_compute: compute_cpu_16.yaml
    run:
      timeout: 3600
      script: bash out_of_core_benchmark.sh
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_oom.yaml
          cluster_compute: compute_gce_cpu_16.yaml
  # Test additional CPU nodes for preprocessing.
  # Nightly run of heterogeneity_benchmark.sh after waiting for 3 nodes; marked
  # stable: false and jailed: true.
  - name: ray-data-bulk-ingest-heterogeneity-benchmark
    group: AIR tests
    working_dir: air_tests/air_benchmarks/mlperf-train
    stable: false
    jailed: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: app_config_oom.yaml
      cluster_compute: compute_cpu_16_worker_nodes_2.yaml
    run:
      wait_for_nodes:
        num_nodes: 3
      # NOTE(review): restored as the run timeout (sibling of script), not the
      # wait_for_nodes timeout - confirm against the upstream file.
      timeout: 1800
      script: bash heterogeneity_benchmark.sh 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_oom.yaml
          cluster_compute: compute_gce_cpu_16_worker_nodes_2.yaml
  #######################
  # AIR examples
  #######################
  # Weekly run of the DreamBooth fine-tuning example via dreambooth_run.sh.
  # Marked stable: false. Only defined for AWS (see the note at the end).
  - name: air_example_dreambooth_finetuning
    group: AIR examples
    working_dir: air_examples/dreambooth
    stable: false
    frequency: weekly
    team: ml
    cluster:
      cluster_env: dreambooth_env.yaml
      cluster_compute: dreambooth_compute_aws.yaml
    run:
      timeout: 1800
      script: bash dreambooth_run.sh
      artifact_path: /tmp/artifacts/example_out.jpg
    # variations: A10G not available on GCE, yet.
  # Weekly run of the gptj_deepspeed_fine_tuning.ipynb notebook through
  # test_myst_doc.py, on Python 3.9. The gce variation is only run manually.
  - name: air_example_gptj_deepspeed_fine_tuning
    group: AIR examples
    working_dir: air_examples/gptj_deepspeed_finetuning
    python: "3.9"
    frequency: weekly
    team: ml
    cluster:
      cluster_env: gptj_deepspeed_env.yaml
      cluster_compute: gptj_deepspeed_compute_aws.yaml
    run:
      timeout: 3600
      script: python test_myst_doc.py --path gptj_deepspeed_fine_tuning.ipynb
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: gptj_deepspeed_env.yaml
          cluster_compute: gptj_deepspeed_compute_gce.yaml
  # Weekly run of the opt_deepspeed_batch_inference.ipynb notebook through
  # test_myst_doc.py, on Python 3.9. No variations are defined yet.
  - name: air_example_opt_deepspeed_batch_inference
    group: AIR examples
    working_dir: air_examples/opt_deepspeed_batch_inference
    python: "3.9"
    frequency: weekly
    team: ml
    cluster:
      cluster_env: 30b_deepspeed_env.yaml
      cluster_compute: 30b_deepspeed_compute.yaml
    run:
      timeout: 3600
      script: python test_myst_doc.py --path opt_deepspeed_batch_inference.ipynb
    # variations: TODO(jungong): add GCP variation.
  #####################################
  # Workspace templates release tests #
  #####################################
  # Converts batch_inference.ipynb to a script and runs it with ipython,
  # three times a week on Python 3.9. The gce variation is only run manually.
  - name: workspace_template_batch_inference
    group: Workspace templates
    working_dir: workspace_templates/tests/01_batch_inference
    python: "3.9"
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: ../../configs/release_test_cluster_env.yaml
      cluster_compute: ../../configs/compute/gpu/aws_release_test.yaml
    run:
      timeout: 600
      script: jupyter nbconvert --to script --output _test batch_inference.ipynb && ipython _test.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: ../../configs/release_test_cluster_env.yaml
          cluster_compute: ../../configs/compute/gpu/gce_release_test.yaml
  # Installs requirements.txt, converts many_model_training.ipynb to a script
  # and runs it with ipython, three times a week on Python 3.9.
  - name: workspace_template_many_model_training
    group: Workspace templates
    working_dir: workspace_templates/tests/02_many_model_training
    python: "3.9"
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: ../../configs/release_test_cluster_env.yaml
      cluster_compute: ../../configs/compute/cpu/aws_release_test.yaml
    run:
      timeout: 600
      script: pip install -U -r requirements.txt && jupyter nbconvert --to script --output _test many_model_training.ipynb && ipython _test.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: ../../configs/release_test_cluster_env.yaml
          cluster_compute: ../../configs/compute/cpu/gce_release_test.yaml
  # Installs requirements.txt, converts serving_stable_diffusion.ipynb to a
  # script and runs it with ipython, three times a week on Python 3.9.
  - name: workspace_template_serving_stable_diffusion
    group: Workspace templates
    working_dir: workspace_templates/tests/03_serving_stable_diffusion
    python: "3.9"
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: ../../configs/release_test_cluster_env.yaml
      cluster_compute: ../../configs/compute/gpu/aws_release_test.yaml
    run:
      timeout: 600
      script: pip install -U -r requirements.txt && jupyter nbconvert --to script --output _test serving_stable_diffusion.ipynb && ipython _test.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: ../../configs/release_test_cluster_env.yaml
          cluster_compute: ../../configs/compute/gpu/gce_release_test.yaml
  #######################
  # XGBoost release tests
  #######################
  # It seems like the consensus is that we can deprecate this test.
  # - name: xgboost_train_small
  #   group: XGBoost
  #   working_dir: xgboost_tests
  #   frequency: nightly
  #   team: ml
  #   env: staging_v2
  #   cluster:
  #     cluster_env: app_config.yaml
  #     cluster_compute: tpl_cpu_small.yaml
  #   run:
  #     timeout: 600
  #     script: python workloads/train_small.py
  #     wait_for_nodes:
  #       num_nodes: 4
  #     type: anyscale_job
  #   alert: xgboost_tests
  # Nightly run of workloads/train_moderate.py after waiting for 32 nodes.
  # Results go to the xgboost_tests alert handler.
  - name: xgboost_train_moderate
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_moderate_aws.yaml
    run:
      timeout: 600
      script: python workloads/train_moderate.py
      wait_for_nodes:
        num_nodes: 32
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_moderate_gce.yaml
    alert: xgboost_tests
  # Nightly run of workloads/train_gpu.py after waiting for 5 nodes, using the
  # GPU cluster env. Results go to the xgboost_tests alert handler.
  - name: xgboost_train_gpu
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config_gpu.yaml
      cluster_compute: tpl_gpu_small_aws.yaml
    run:
      timeout: 600
      script: python workloads/train_gpu.py
      wait_for_nodes:
        num_nodes: 5
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_gpu.yaml
          cluster_compute: tpl_gpu_small_gce.yaml
    alert: xgboost_tests
  # Nightly run of workloads/distributed_api_test.py after waiting for 4 nodes.
  - name: xgboost_distributed_api_test
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_small_aws.yaml
    run:
      timeout: 600
      script: python workloads/distributed_api_test.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_small_gce.yaml
    alert: default
  # Nightly run of workloads/ft_small_elastic.py after waiting for 4 nodes.
  - name: xgboost_ft_small_elastic
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_small_aws.yaml
    run:
      timeout: 900
      script: python workloads/ft_small_elastic.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_small_gce.yaml
    alert: default
  # Nightly run of workloads/ft_small_non_elastic.py after waiting for 4 nodes.
  - name: xgboost_ft_small_non_elastic
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_small_aws.yaml
    run:
      timeout: 900
      script: python workloads/ft_small_non_elastic.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_small_gce.yaml
    alert: default
  # Nightly run of workloads/tune_small.py after waiting for 4 nodes.
  # Results go to the xgboost_tests alert handler.
  - name: xgboost_tune_small
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_small_aws.yaml
    run:
      timeout: 600
      script: python workloads/tune_small.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_small_gce.yaml
    alert: xgboost_tests
  # Nightly run of workloads/tune_32x4.py after waiting for 32 nodes.
  # Results go to the xgboost_tests alert handler.
  - name: xgboost_tune_32x4
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_moderate_aws.yaml
    run:
      timeout: 900
      script: python workloads/tune_32x4.py
      wait_for_nodes:
        num_nodes: 32
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_moderate_gce.yaml
    alert: xgboost_tests
  # Nightly run of workloads/tune_4x32.py after waiting for 32 nodes.
  # Results go to the xgboost_tests alert handler.
  - name: xgboost_tune_4x32
    group: XGBoost
    working_dir: xgboost_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_moderate_aws.yaml
    run:
      timeout: 900
      script: python workloads/tune_4x32.py
      wait_for_nodes:
        num_nodes: 32
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_moderate_gce.yaml
    alert: xgboost_tests
  #######################
  # LightGBM tests
  #######################
  # It seems like the consensus is that we can deprecate this test.
  # - name: lightgbm_train_small
  #   group: LightGBM tests
  #   working_dir: lightgbm_tests
  #   frequency: nightly
  #   team: ml
  #   env: staging_v2
  #   cluster:
  #     cluster_env: app_config.yaml
  #     cluster_compute: tpl_cpu_small.yaml
  #   run:
  #     timeout: 600
  #     script: python workloads/train_small.py
  #     wait_for_nodes:
  #       num_nodes: 4
  #     type: anyscale_job
  #   alert: default
  # Nightly run of workloads/train_moderate.py after waiting for 32 nodes.
  - name: lightgbm_train_moderate
    group: LightGBM tests
    working_dir: lightgbm_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_moderate_aws.yaml
    run:
      timeout: 600
      script: python workloads/train_moderate.py
      wait_for_nodes:
        num_nodes: 32
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_moderate_gce.yaml
    alert: default
  # Nightly run of workloads/distributed_api_test.py after waiting for 4 nodes.
  - name: lightgbm_distributed_api_test
    group: LightGBM tests
    working_dir: lightgbm_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_small_aws.yaml
    run:
      timeout: 600
      script: python workloads/distributed_api_test.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_small_gce.yaml
    alert: default
  # Nightly run of workloads/ft_small_non_elastic.py after waiting for 4 nodes.
  - name: lightgbm_ft_small_non_elastic
    group: LightGBM tests
    working_dir: lightgbm_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_small_aws.yaml
    run:
      timeout: 900
      script: python workloads/ft_small_non_elastic.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_small_gce.yaml
    alert: default
  # LightGBM tune_small workload on the "small" CPU template (4 nodes).
  # Scheduled nightly; the gce variation is manual-only.
  - name: lightgbm_tune_small
    group: LightGBM tests
    working_dir: lightgbm_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_small_aws.yaml
    run:
      timeout: 600
      script: python workloads/tune_small.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_small_gce.yaml
    alert: default
  # LightGBM tune_16x4 workload on the "moderate" CPU template (32 nodes).
  # Scheduled nightly with a 900 s timeout; the gce variation is manual-only.
  - name: lightgbm_tune_16x4
    group: LightGBM tests
    working_dir: lightgbm_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_moderate_aws.yaml
    run:
      timeout: 900
      script: python workloads/tune_16x4.py
      wait_for_nodes:
        num_nodes: 32
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_moderate_gce.yaml
    alert: default
  # LightGBM tune_4x16 workload on the "moderate" CPU template (32 nodes).
  # Scheduled nightly with a 900 s timeout; the gce variation is manual-only.
  - name: lightgbm_tune_4x16
    group: LightGBM tests
    working_dir: lightgbm_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_moderate_aws.yaml
    run:
      timeout: 900
      script: python workloads/tune_4x16.py
      wait_for_nodes:
        num_nodes: 32
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_moderate_gce.yaml
    alert: default
  1071. #######################
  1072. # Lightning tests
  1073. #######################
  1074. # Naming convention: lightning_{accelerator}_{mode}_{#cpu}_{#gpu}
  # Lightning trainer workload (test_trainer.py) on a 3-node cluster.
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: lightning_gpu_train_3x16_3x1
    group: Lightning tests
    working_dir: lightning_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_aws.yaml
    run:
      timeout: 1200
      script: python workloads/test_trainer.py
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce.yaml
    alert: default
  # Lightning tuner workload (test_tuner.py) on a 3-node cluster.
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: lightning_gpu_tune_3x16_3x1
    group: Lightning tests
    working_dir: lightning_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_aws.yaml
    run:
      timeout: 1200
      script: python workloads/test_tuner.py
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce.yaml
    alert: default
  1119. #######################
  1120. # ML user tests
  1121. #######################
  # Horovod user test using the "latest" driver setup script (4 nodes).
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: ml_user_horovod_user_test_latest
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: horovod/app_config.yaml
      cluster_compute: horovod/compute_tpl_aws.yaml
    driver_setup: horovod/driver_setup_latest.sh
    run:
      timeout: 1200
      script: python horovod/horovod_user_test.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: horovod/app_config.yaml
          cluster_compute: horovod/compute_tpl_gce.yaml
    alert: default
  # Horovod user test using the "master" app config and driver setup (4 nodes).
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: ml_user_horovod_user_test_master
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: horovod/app_config_master.yaml
      cluster_compute: horovod/compute_tpl_aws.yaml
    driver_setup: horovod/driver_setup_master.sh
    run:
      timeout: 1200
      script: python horovod/horovod_user_test.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: horovod/app_config_master.yaml
          cluster_compute: horovod/compute_tpl_gce.yaml
    alert: default
  # Train user test: TensorFlow MNIST script on a 3-node cluster.
  # Scheduled nightly-3x with a 36000 s timeout; the gce variation is manual-only.
  - name: ml_user_train_tensorflow_mnist_test
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: train/app_config.yaml
      cluster_compute: train/compute_tpl_aws.yaml
    driver_setup: train/driver_setup.sh
    run:
      timeout: 36000
      script: python train/train_tensorflow_mnist_test.py
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: train/app_config.yaml
          cluster_compute: train/compute_tpl_gce.yaml
    alert: default
  # Train user test: Torch linear script on a 3-node cluster.
  # Scheduled nightly-3x with a 36000 s timeout; the gce variation is manual-only.
  - name: ml_user_train_torch_linear_test
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: train/app_config.yaml
      cluster_compute: train/compute_tpl_aws.yaml
    driver_setup: train/driver_setup.sh
    run:
      timeout: 36000
      script: python train/train_torch_linear_test.py
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: train/app_config.yaml
          cluster_compute: train/compute_tpl_gce.yaml
    alert: default
  # XGBoost GPU connect test with the default GPU app config (5 nodes).
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: ml_user_xgboost_gpu_connect_latest
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: xgboost/app_config_gpu.yaml
      cluster_compute: xgboost/tpl_gpu_small_scaling_aws.yaml
    run:
      timeout: 1200
      script: python xgboost/train_gpu_connect.py
      wait_for_nodes:
        num_nodes: 5
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: xgboost/app_config_gpu.yaml
          cluster_compute: xgboost/tpl_gpu_small_scaling_gce.yaml
    alert: default
  # XGBoost GPU connect test with the "master" GPU app config (5 nodes).
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: ml_user_xgboost_gpu_connect_master
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: xgboost/app_config_gpu_master.yaml
      cluster_compute: xgboost/tpl_gpu_small_scaling_aws.yaml
    run:
      timeout: 1200
      script: python xgboost/train_gpu_connect.py
      wait_for_nodes:
        num_nodes: 5
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: xgboost/app_config_gpu_master.yaml
          cluster_compute: xgboost/tpl_gpu_small_scaling_gce.yaml
    alert: default
  # ray-lightning user test with the default app config (3 nodes).
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: ml_user_ray_lightning_user_test_latest
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: ray-lightning/app_config.yaml
      cluster_compute: ray-lightning/compute_tpl_aws.yaml
    driver_setup: ray-lightning/driver_setup.sh
    run:
      timeout: 1200
      script: python ray-lightning/ray_lightning_user_test.py
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: ray-lightning/app_config.yaml
          cluster_compute: ray-lightning/compute_tpl_gce.yaml
    alert: default
  # ray-lightning user test with the "master" app config (3 nodes).
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: ml_user_ray_lightning_user_test_master
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: ray-lightning/app_config_master.yaml
      cluster_compute: ray-lightning/compute_tpl_aws.yaml
    driver_setup: ray-lightning/driver_setup.sh
    run:
      timeout: 1200
      script: python ray-lightning/ray_lightning_user_test.py
      wait_for_nodes:
        num_nodes: 3
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: ray-lightning/app_config_master.yaml
          cluster_compute: ray-lightning/compute_tpl_gce.yaml
    alert: default
  # Tune + RLlib connect tests (9 nodes); the cluster env is taken from the
  # sibling rllib_tests directory. Scheduled nightly-3x; gce is manual-only.
  - name: ml_user_tune_rllib_connect_test
    group: ML user tests
    working_dir: ml_user_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: ../rllib_tests/app_config.yaml
      cluster_compute: tune_rllib/compute_tpl_aws.yaml
    driver_setup: tune_rllib/driver_setup.sh
    run:
      timeout: 2000
      script: python tune_rllib/run_connect_tests.py
      wait_for_nodes:
        num_nodes: 9
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: ../rllib_tests/app_config.yaml
          cluster_compute: tune_rllib/compute_tpl_gce.yaml
    alert: default
  1327. #######################
  1328. # Tune cloud tests
  1329. #######################
  # Tune cloud test, "no_sync_down" scenario (4 nodes).
  # Scheduled nightly; the gce variation is manual-only.
  - name: tune_cloud_no_sync_down
    group: Tune cloud tests
    working_dir: tune_tests/cloud_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_aws_4x2.yaml
    run:
      timeout: 600
      script: python workloads/run_cloud_test.py no_sync_down
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          # cluster_env is spelled out here so this override does not rely on
          # the variation being deep-merged into the base `cluster` mapping.
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_4x8.yaml
    alert: tune_tests
  # Tune cloud test, "ssh_sync" scenario (4 nodes).
  # Scheduled nightly; the gce variation is manual-only.
  - name: tune_cloud_ssh_sync
    group: Tune cloud tests
    working_dir: tune_tests/cloud_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_aws_4x2.yaml
    run:
      timeout: 600
      script: python workloads/run_cloud_test.py ssh_sync
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_4x8.yaml
    alert: tune_tests
  # Tune cloud test, "durable_upload" scenario (4 nodes).
  # The base entry uploads to an s3:// bucket; the manual gce variation
  # overrides `run` to target the gs:// bucket instead.
  - name: tune_cloud_durable_upload
    group: Tune cloud tests
    working_dir: tune_tests/cloud_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_aws_4x2.yaml
    run:
      timeout: 600
      script: python workloads/run_cloud_test.py durable_upload --bucket s3://tune-cloud-tests/durable_upload
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_4x8.yaml
        run:
          timeout: 600
          script: python workloads/run_cloud_test.py durable_upload --bucket gs://tune-cloud-tests/durable_upload
          wait_for_nodes:
            num_nodes: 4
    alert: tune_tests
  # Tune cloud test, "durable_upload" scenario with the rllib_str trainable
  # (4 nodes). Marked `stable: false`. The manual gce variation overrides
  # `run` to target the gs:// bucket.
  - name: tune_cloud_durable_upload_rllib_str
    group: Tune cloud tests
    working_dir: tune_tests/cloud_tests
    stable: false
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config_ml.yaml
      cluster_compute: tpl_aws_4x2.yaml
    run:
      timeout: 600
      # Folded scalar: the two lines are joined with a single space.
      script: >-
        python workloads/run_cloud_test.py durable_upload --trainable rllib_str
        --bucket s3://tune-cloud-tests/durable_upload_rllib_str
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_ml.yaml
          cluster_compute: tpl_gce_4x2.yaml
        run:
          timeout: 600
          script: >-
            python workloads/run_cloud_test.py durable_upload --trainable rllib_str
            --bucket gs://tune-cloud-tests/durable_upload_rllib_str
          wait_for_nodes:
            num_nodes: 4
    alert: tune_tests
  # Tune cloud test, "durable_upload" scenario with the rllib_trainer
  # trainable (4 nodes). Marked `stable: false`. The manual gce variation
  # overrides `run` to target the gs:// bucket.
  - name: tune_cloud_durable_upload_rllib_trainer
    group: Tune cloud tests
    working_dir: tune_tests/cloud_tests
    stable: false
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config_ml.yaml
      cluster_compute: tpl_aws_4x2.yaml
    run:
      timeout: 600
      # Folded scalar: the two lines are joined with a single space.
      script: >-
        python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer
        --bucket s3://tune-cloud-tests/durable_upload_rllib_trainer
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_ml.yaml
          cluster_compute: tpl_gce_4x2.yaml
        run:
          timeout: 600
          # Uses the rllib_trainer trainable, matching the base entry and the
          # bucket path (previously a copy-paste of `rllib_str`).
          script: >-
            python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer
            --bucket gs://tune-cloud-tests/durable_upload_rllib_trainer
          wait_for_nodes:
            num_nodes: 4
    alert: tune_tests
  1460. ########################
  1461. # Tune scalability tests
  1462. ########################
  # Tune scalability: bookkeeping overhead workload on the 1x16 template.
  # Scheduled nightly; the gce variation is manual-only.
  - name: tune_scalability_bookkeeping_overhead
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_1x16.yaml
    run:
      timeout: 1200
      script: python workloads/test_bookkeeping_overhead.py
    alert: tune_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_1x16.yaml
  # Tune scalability: durable trainable workload on 16 nodes.
  # The base entry uses an s3:// bucket; the manual gce variation overrides
  # `run` to use the gs:// bucket.
  - name: tune_scalability_durable_trainable
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_16x2.yaml
    run:
      timeout: 900
      script: python workloads/test_durable_trainable.py --bucket s3://tune-cloud-tests/scalability_durable_trainable
      wait_for_nodes:
        num_nodes: 16
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        run:
          timeout: 900
          script: python workloads/test_durable_trainable.py --bucket gs://tune-cloud-tests/scalability_durable_trainable
          wait_for_nodes:
            num_nodes: 16
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_16x2.yaml
    alert: tune_tests
  # Tune scalability: durable multi-file checkpoint workload on 16 nodes.
  # The base entry uses an s3:// bucket; the manual gce variation overrides
  # `run` to use the gs:// bucket.
  - name: tune_scalability_durable_multifile_checkpoints
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_16x2.yaml
    run:
      timeout: 900
      script: python workloads/test_durable_multifile_checkpoints.py --bucket s3://tune-cloud-tests/scalability_durable_multifile_checkpoints
      wait_for_nodes:
        num_nodes: 16
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        run:
          timeout: 900
          script: python workloads/test_durable_multifile_checkpoints.py --bucket gs://tune-cloud-tests/scalability_durable_multifile_checkpoints
          wait_for_nodes:
            num_nodes: 16
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_16x2.yaml
    alert: tune_tests
  # Tune scalability: long-running large-checkpoint workload.
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation (and its smoke test) is manual.
  - name: tune_scalability_long_running_large_checkpoints
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: weekly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_1x32_hd.yaml
    run:
      timeout: 86400
      script: python workloads/test_long_running_large_checkpoints.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: tune_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_1x32_hd.yaml
  # Tune scalability: network overhead workload on 100 nodes, weekly.
  # Variations: aws (as-is), a nightly 20-node smoke-test that passes
  # --smoke-test to the script, and a manual gce run.
  - name: tune_scalability_network_overhead
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: weekly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_100x2.yaml
    run:
      timeout: 900
      prepare_timeout: 1200
      script: python workloads/test_network_overhead.py
      wait_for_nodes:
        num_nodes: 100
    alert: tune_tests
    variations:
      - __suffix__: aws
      - __suffix__: smoke-test
        frequency: nightly
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_20x2.yaml
        run:
          timeout: 500
          prepare_timeout: 600
          script: python workloads/test_network_overhead.py --smoke-test
          wait_for_nodes:
            num_nodes: 20
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_100x2.yaml
  # Tune scalability: result throughput across a 16-node cluster.
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: tune_scalability_result_throughput_cluster
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_16x64.yaml
    run:
      timeout: 600
      script: python workloads/test_result_throughput_cluster.py
      wait_for_nodes:
        num_nodes: 16
    alert: tune_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_16x64.yaml
  # Tune scalability: result throughput on the 1x96 single-node template.
  # Scheduled nightly; the gce variation is manual-only and restates `run`.
  - name: tune_scalability_result_throughput_single_node
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_1x96.yaml
    run:
      timeout: 600
      script: python workloads/test_result_throughput_single_node.py
    alert: tune_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_gce_1x96.yaml
        run:
          timeout: 600
          script: python workloads/test_result_throughput_single_node.py
          # NOTE(review): assumed to belong to the gce `run` mapping, since it
          # directly follows that variation's script - confirm.
          type: anyscale_job
  # Tune scalability: XGBoost sweep on 16 nodes using the data app config.
  # Scheduled weekly with a 3600 s timeout; the gce variation is manual-only.
  - name: tune_scalability_xgboost_sweep
    group: Tune scalability tests
    working_dir: tune_tests/scalability_tests
    frequency: weekly
    team: ml
    cluster:
      cluster_env: app_config_data.yaml
      cluster_compute: tpl_16x64.yaml
    run:
      timeout: 3600
      script: python workloads/test_xgboost_sweep.py
      wait_for_nodes:
        num_nodes: 16
    alert: tune_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_data.yaml
          cluster_compute: tpl_gce_16x64.yaml
  1666. ############################
  1667. # Tune fault tolerance tests
  1668. ############################
  # Tune worker fault tolerance on 16 nodes, writing to an s3:// bucket.
  # Scheduled nightly-3x with a 5400 s timeout. The aws/gce variations are
  # kept below as comments because the GCE variant is currently disabled.
  - name: tune_worker_fault_tolerance
    group: Tune fault tolerance tests
    working_dir: tune_tests/fault_tolerance_tests
    stable: true
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_aws_16x1.yaml
    run:
      timeout: 5400
      script: python workloads/test_tune_worker_fault_tolerance.py --bucket s3://tune-cloud-tests/worker_fault_tolerance
      wait_for_nodes:
        num_nodes: 16
    # Disabled until we can kill nodes in GCE
    # variations:
    #   - __suffix__: aws
    #   - __suffix__: gce
    #     env: gce
    #     frequency: manual
    #     run:
    #       timeout: 5400
    #       script: python workloads/test_tune_worker_fault_tolerance.py --bucket gs://tune-cloud-tests/worker_fault_tolerance
    #
    #       wait_for_nodes:
    #         num_nodes: 16
    #     cluster:
    #       cluster_env: app_config.yaml
    #       cluster_compute: tpl_gce_16x1.yaml
  1698. ########################
  1699. # Golden Notebook tests
  1700. ########################
  # Golden notebook: torch_tune_serve workload on a 2-node GPU template.
  # Scheduled nightly-3x; the gce variation is manual-only.
  - name: golden_notebook_torch_tune_serve_test
    group: Golden Notebook tests
    working_dir: golden_notebook_tests
    frequency: nightly-3x
    team: ml
    cluster:
      cluster_env: torch_tune_serve_app_config.yaml
      cluster_compute: gpu_tpl_aws.yaml
    run:
      timeout: 600
      script: python workloads/torch_tune_serve_test.py
      wait_for_nodes:
        num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: torch_tune_serve_app_config.yaml
          cluster_compute: gpu_tpl_gce.yaml
    alert: default
  1723. #######################
  1724. # Long running tests
  1725. #######################
  # Long-running actor_deaths workload (core team).
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_actor_deaths
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/actor_deaths.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running apex workload (rllib team) on 3 nodes.
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_apex
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: rllib
    cluster:
      cluster_env: ../rllib_tests/app_config.yaml
      cluster_compute: tpl_cpu_3.yaml
    run:
      timeout: 86400
      script: python workloads/apex.py
      long_running: true
      wait_for_nodes:
        num_nodes: 3
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: ../rllib_tests/app_config.yaml
          cluster_compute: tpl_cpu_3_gce.yaml
  # Long-running impala workload (rllib team) on the large single-node template.
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_impala
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: rllib
    cluster:
      cluster_env: ../rllib_tests/app_config.yaml
      cluster_compute: tpl_cpu_1_large.yaml
    run:
      timeout: 86400
      script: python workloads/impala.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: ../rllib_tests/app_config.yaml
          cluster_compute: tpl_cpu_1_large_gce.yaml
  # Long-running many_actor_tasks workload (core team).
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_many_actor_tasks
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/many_actor_tasks.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running many_drivers workload (core team), run with --iteration-num=4000.
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_many_drivers
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/many_drivers.py --iteration-num=4000
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running many_ppo workload (ml team); marked `stable: false`.
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_many_ppo
    group: Long running tests
    working_dir: long_running_tests
    stable: false
    frequency: weekly
    team: ml
    cluster:
      cluster_env: ../rllib_tests/app_config.yaml
      cluster_compute: many_ppo.yaml
    run:
      timeout: 86400
      script: python workloads/many_ppo.py
      long_running: true
      wait_for_nodes:
        num_nodes: 1
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: ../rllib_tests/app_config.yaml
          cluster_compute: many_ppo_gce.yaml
  # Long-running many_tasks workload (core team).
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_many_tasks
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/many_tasks.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running many_tasks_serialized_ids workload (core team).
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_many_tasks_serialized_ids
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/many_tasks_serialized_ids.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running node_failures workload (core team).
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_node_failures
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: core
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/node_failures.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running pbt workload (ml team), using the rllib_tests cluster env.
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_pbt
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: ml
    cluster:
      cluster_env: ../rllib_tests/app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/pbt.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: ../rllib_tests/app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running serve workload (serve team).
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 3600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_serve
    group: Long running tests
    working_dir: long_running_tests
    frequency: weekly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/serve.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running serve_failure workload (serve team); marked `stable: true`.
  # Full run is weekly with an 86400 s timeout; the smoke test is nightly
  # with a 600 s timeout. The gce variation and its smoke test are manual.
  - name: long_running_serve_failure
    group: Long running tests
    working_dir: long_running_tests
    stable: true
    frequency: weekly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1_c5.yaml
    run:
      timeout: 86400
      script: python workloads/serve_failure.py
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            # NOTE(review): 86400 equals the full-run timeout, while the base
            # smoke test uses 600 - confirm this smoke-test value is intended
            # (and that this `run` belongs under `smoke_test`).
            timeout: 86400
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_c5_gce.yaml
  # Long-running many-jobs workload run with --num-clients=1, marked stable:
  # weekly full run plus a nightly smoke test.
  - name: long_running_many_jobs
    group: Long running tests
    working_dir: long_running_tests
    stable: true
    frequency: weekly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_cpu_1.yaml
    run:
      timeout: 86400
      script: python workloads/long_running_many_jobs.py --num-clients=1
      long_running: true
    smoke_test:
      frequency: nightly
      run:
        timeout: 1800
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_cpu_1_gce.yaml
  # Long-running distributed PyTorch PBT failure workload: weekly full run;
  # the smoke test frequency is manual in the base entry and in gce.
  - name: long_running_distributed_pytorch_pbt_failure
    group: Long running tests
    working_dir: long_running_distributed_tests
    frequency: weekly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl.yaml
    run:
      timeout: 86400
      script: python workloads/pytorch_pbt_failure.py
      long_running: true
    smoke_test:
      frequency: manual
      run:
        timeout: 3600
    alert: long_running_tests
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 3600
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce.yaml
  ########################
  # Jobs tests
  ########################
  # Basic jobs workload passing the local "workloads" directory as
  # --working-dir; waits for 4 nodes. Base frequency nightly, gce manual.
  - name: jobs_basic_local_working_dir
    group: Jobs tests
    working_dir: jobs_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_4_xlarge.yaml
    run:
      timeout: 600
      script: python workloads/jobs_basic.py --working-dir "workloads"
      wait_for_nodes:
        num_nodes: 4
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_4_xlarge.yaml
  # Basic jobs workload passing a remote zip archive URL as --working-dir;
  # waits for 4 nodes. Base frequency nightly, gce manual.
  - name: jobs_basic_remote_working_dir
    group: Jobs tests
    working_dir: jobs_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_4_xlarge.yaml
    run:
      timeout: 600
      # Folded scalar (>-): the two lines join with a single space, giving the
      # same one-line command without exceeding the line-length limit.
      script: >-
        python workloads/jobs_basic.py --working-dir
        "https://github.com/anyscale/job-services-cuj-examples/archive/refs/heads/main.zip"
      wait_for_nodes:
        num_nodes: 4
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_4_xlarge.yaml
  # Remote multi-node jobs workload; waits for 4 nodes. Base frequency
  # nightly, gce manual.
  - name: jobs_remote_multi_node
    group: Jobs tests
    team: serve
    frequency: nightly
    working_dir: jobs_tests
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_4_xlarge.yaml
    run:
      timeout: 600
      script: python workloads/jobs_remote_multi_node.py
      wait_for_nodes:
        num_nodes: 4
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_4_xlarge.yaml
  # Runs workloads/jobs_check_cuda_available.py on the gpu_node compute
  # template; waits for 2 nodes. Base frequency nightly, gce manual.
  - name: jobs_check_cuda_available
    group: Jobs tests
    team: serve
    frequency: nightly
    working_dir: jobs_tests
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_gpu_node.yaml
    run:
      timeout: 600
      script: python workloads/jobs_check_cuda_available.py
      wait_for_nodes:
        num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_gpu_node.yaml
  # Runs workloads/jobs_specify_num_gpus.py on the gpu_worker compute
  # template; waits for 2 nodes. Base frequency nightly, gce manual.
  - name: jobs_specify_num_gpus
    group: Jobs tests
    team: serve
    frequency: nightly
    working_dir: jobs_tests
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_gpu_worker.yaml
    run:
      timeout: 600
      script: python workloads/jobs_specify_num_gpus.py --working-dir "workloads"
      wait_for_nodes:
        num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce_gpu_worker.yaml
  ########################
  # Runtime env tests
  ########################
  # Runtime-env workload rte_many_tasks_actors.py; waits for 4 nodes. Base
  # frequency nightly, gce manual.
  - name: runtime_env_rte_many_tasks_actors
    group: Runtime env tests
    working_dir: runtime_env_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: rte_small.yaml
    run:
      timeout: 600
      script: python workloads/rte_many_tasks_actors.py
      wait_for_nodes:
        num_nodes: 4
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: rte_gce_small.yaml
  # Runtime-env workload wheel_urls.py on the minimal compute template; waits
  # for 1 node. Base frequency nightly, gce manual.
  - name: runtime_env_wheel_urls
    group: Runtime env tests
    working_dir: runtime_env_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: rte_minimal.yaml
    run:
      timeout: 9000
      script: python workloads/wheel_urls.py
      wait_for_nodes:
        num_nodes: 1
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: rte_gce_minimal.yaml
  # It seems like the consensus is that this should be tested in CI, and not in a nightly test.
  # - name: runtime_env_rte_ray_client
  #   group: Runtime env tests
  #   working_dir: runtime_env_tests
  #   frequency: nightly
  #   team: serve
  #   cluster:
  #     cluster_env: app_config.yaml
  #     cluster_compute: rte_minimal.yaml
  #   run:
  #     timeout: 600
  #     script: python workloads/rte_ray_client.py
  #     wait_for_nodes:
  #       num_nodes: 1
  #     type: anyscale_job
  #   alert: default
  ########################
  # Serve tests
  ########################
  # Serve workload single_deployment_1k_noop_replica.py. Base frequency
  # nightly, gce manual.
  - name: serve_single_deployment_1k_noop_replica
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_32_cpu.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/single_deployment_1k_noop_replica.py
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_32_cpu_gce.yaml
  # Serve workload multi_deployment_1k_noop_replica.py. Base frequency
  # nightly, gce manual.
  - name: serve_multi_deployment_1k_noop_replica
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_32_cpu.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/multi_deployment_1k_noop_replica.py
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_32_cpu_gce.yaml
  # Serve workload autoscaling_single_deployment.py on the 8-CPU autoscaling
  # compute template. Base frequency nightly, gce manual.
  - name: serve_autoscaling_single_deployment
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_8_cpu_autoscaling.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/autoscaling_single_deployment.py
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_8_cpu_autoscaling_gce.yaml
  # Serve workload autoscaling_multi_deployment.py on the 32-CPU autoscaling
  # compute template. Base frequency nightly, gce manual.
  - name: serve_autoscaling_multi_deployment
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_32_cpu_autoscaling.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/autoscaling_multi_deployment.py
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_32_cpu_autoscaling_gce.yaml
  # Serve workload serve_micro_benchmark.py on the single-node compute
  # template. Base frequency nightly, gce manual.
  - name: serve_serve_micro_benchmark
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_micro_benchmark.py
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_gce.yaml
  # - name: serve_serve_micro_benchmark_k8s
  #   group: Serve tests
  #   working_dir: serve_tests
  #   # TODO(architkulkarni) Reenable after K8s migration. Currently failing
  #   frequency: manual
  #   team: serve
  #   cluster:
  #     cluster_env: app_config.yaml
  #     cluster_compute: compute_tpl_single_node_k8s.yaml
  #   run:
  #     timeout: 7200
  #     long_running: false
  #     script: python workloads/serve_micro_benchmark.py
  #   alert: default
  # Serve workload deployment_graph_long_chain.py (chain length 10, 4
  # clients), marked unstable. Base frequency nightly, gce manual.
  - name: deployment_graph_long_chain
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml
    run:
      timeout: 3600
      long_running: false
      script: python workloads/deployment_graph_long_chain.py --chain-length=10 --num-clients=4 --local-test=False
    alert: default
    # Canonical lowercase boolean, matching the other `stable:` keys in this file.
    stable: false
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Serve workload deployment_graph_wide_ensemble.py (fanout degree 10, 4
  # clients), marked unstable. Base frequency nightly, gce manual.
  - name: deployment_graph_wide_ensemble
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml
    run:
      timeout: 3600
      long_running: false
      script: python workloads/deployment_graph_wide_ensemble.py --fanout-degree=10 --num-clients=4 --local-test=False
    alert: default
    # Canonical lowercase boolean, matching the other `stable:` keys in this file.
    stable: false
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Serve workload serve_handle_long_chain.py (chain length 10, 4 clients),
  # marked unstable. Base frequency nightly, gce manual.
  - name: serve_handle_long_chain
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml
    run:
      timeout: 3600
      long_running: false
      script: python workloads/serve_handle_long_chain.py --chain-length=10 --num-clients=4 --local-test=False
    alert: default
    # Canonical lowercase boolean, matching the other `stable:` keys in this file.
    stable: false
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Serve workload serve_handle_wide_ensemble.py (fanout degree 10, 4
  # clients), marked unstable. Base frequency nightly, gce manual.
  - name: serve_handle_wide_ensemble
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node_32_cpu.yaml
    run:
      timeout: 3600
      long_running: false
      script: python workloads/serve_handle_wide_ensemble.py --fanout-degree=10 --num-clients=4 --local-test=False
    alert: default
    # Canonical lowercase boolean, matching the other `stable:` keys in this file.
    stable: false
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_32_cpu_gce.yaml
  # Serve protocol benchmark with --data-size=1048576 and no --http-test
  # flag. Base frequency nightly, gce manual.
  - name: serve_micro_protocol_grpc_benchmark
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_protocol_benchmark.py --data-size=1048576
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_gce.yaml
  # Serve protocol benchmark with --data-size=1048576 and --http-test. Base
  # frequency nightly, gce manual.
  - name: serve_micro_protocol_http_benchmark
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_single_node.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_protocol_benchmark.py --data-size=1048576 --http-test
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_single_node_gce.yaml
  # Serve ResNet benchmark run with --gpu-env, using gpu_app_config.yaml and
  # the gpu_node compute template. Base frequency nightly, gce manual.
  - name: serve_resnet_benchmark
    group: Serve tests
    working_dir: serve_tests
    frequency: nightly
    team: serve
    cluster:
      cluster_env: gpu_app_config.yaml
      cluster_compute: compute_tpl_gpu_node.yaml
    run:
      timeout: 7200
      long_running: false
      script: python workloads/serve_resnet_benchmark.py --gpu-env
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: gpu_app_config.yaml
          cluster_compute: compute_tpl_gpu_node_gce.yaml
  ########################
  # Train tests
  ########################
  # Horovod multi-node training test; waits for 2 nodes. Base frequency
  # nightly, gce manual.
  - name: train_horovod_multi_node_test
    group: Train tests
    working_dir: train_tests/horovod
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl_aws.yaml
    run:
      timeout: 3000
      script: python train_horovod_multi_node_test.py
      wait_for_nodes:
        num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: compute_tpl_gce.yaml
    alert: default
  ########################
  # Alpa tests
  ########################
  # Alpa OPT-2.7B training sanity check; waits for 2 nodes. The gce variation
  # overrides `run` so the script is invoked with --storage gcs instead of aws.
  - name: alpa_opt_2_7b_sanity_check
    group: Alpa tests
    working_dir: alpa_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: gpu_2x4_t4_aws.yaml
    run:
      timeout: 3600
      script: bash run_train_opt_2_7b.sh --storage aws
      wait_for_nodes:
        num_nodes: 2
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: gpu_2x4_t4_gce.yaml
        run:
          timeout: 3600
          script: bash run_train_opt_2_7b.sh --storage gcs
          wait_for_nodes:
            num_nodes: 2
    alert: default
  # Alpa OPT-30B inference test; waits for 1 node. The gce variation
  # overrides `run` so the script is invoked with --storage gcs instead of aws.
  - name: alpa_opt_30b_inference
    group: Alpa tests
    working_dir: alpa_tests
    frequency: nightly
    team: ml
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: gpu_1x8_v100_aws.yaml
    run:
      timeout: 3600
      script: bash run_inference_opt_30b.sh --storage aws
      wait_for_nodes:
        num_nodes: 1
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: gpu_1x8_v100_gce.yaml
        run:
          timeout: 3600
          script: bash run_inference_opt_30b.sh --storage gcs
          wait_for_nodes:
            num_nodes: 1
    alert: default
  ########################
  # RLlib tests
  ########################
  # Runs the learner-group checkpointing pytest module; waits for 3 nodes.
  # Base frequency nightly, gce manual.
  - name: rllib_learner_group_checkpointing_multinode
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: multi_node_checkpointing_compute_config.yaml
    run:
      timeout: 3600
      script: pytest checkpointing_tests/test_learner_group_checkpointing.py
      wait_for_nodes:
        num_nodes: 3
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  # RLlib learning tests: yaml sub-dir a2c, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_a2c_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a2c --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir a2c, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_a2c_torch
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a2c --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir a3c, framework tf, on the 32cpus
  # compute config. Base frequency nightly, gce manual.
  - name: rllib_learning_tests_a3c_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 32cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a3c --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 32cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir apex, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_apex_tf
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_24cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=apex --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_24cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir apex, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_apex_torch
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_24cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=apex --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_24cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir appo, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_appo_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_32cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=appo --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_32cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir appo, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_appo_torch
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_32cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=appo --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_32cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir bc, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_bc_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=bc --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir bc, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_bc_torch
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=bc --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir cql, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_cql_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    # Marking as unstable since it's currently expected to fail.
    stable: false
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=cql --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir cql, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_cql_torch
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=cql --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir ddpg, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_ddpg_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir ddpg, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_ddpg_torch
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir dqn, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_dqn_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir dqn, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_dqn_torch
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir es, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_es_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_64cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=es --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_64cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir es, framework torch. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_es_torch
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_64cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=es --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_64cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir impala, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_impala_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=impala --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir impala, framework torch. Base
  # frequency nightly, gce manual.
  - name: rllib_learning_tests_impala_torch
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=impala --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir marwil, framework tf. Base frequency
  # nightly, gce manual.
  - name: rllib_learning_tests_marwil_tf
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=tf
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # RLlib learning tests: yaml sub-dir marwil, framework torch. Base
  # frequency nightly, gce manual.
  - name: rllib_learning_tests_marwil_torch
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml
    run:
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=torch
    alert: default
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Runs learning_tests/run.py on the "ppo/tf" YAML sub-dir with tf.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_ppo_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_32cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=ppo/tf --framework=tf

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_32cpus_gce.yaml
  # Runs learning_tests/run.py on the "ppo/torch" YAML sub-dir with torch.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_ppo_torch
    group: RLlib tests
    working_dir: rllib_tests

    # Marking as unstable since it's currently expected to fail.
    stable: false

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 2gpus_32cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=ppo/torch --framework=torch

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 2gpus_32cpus_gce.yaml
  # Runs learning_tests/run.py on the "sac" YAML sub-dir with tf.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_sac_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=sac --framework=tf

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Runs learning_tests/run.py on the "sac" YAML sub-dir with torch.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_sac_torch
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=sac --framework=torch

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Runs learning_tests/run.py on the "slateq" YAML sub-dir with tf.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_slateq_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=slateq --framework=tf

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Runs learning_tests/run.py on the "slateq" YAML sub-dir with torch.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_slateq_torch
    group: RLlib tests
    working_dir: rllib_tests

    # Marking as unstable since it's currently expected to fail.
    stable: false

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=slateq --framework=torch

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Runs learning_tests/run.py on the "td3" YAML sub-dir with tf.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_td3_tf
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=td3 --framework=tf

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Runs learning_tests/run.py on the "td3" YAML sub-dir with torch.
  # Scheduled nightly; the gce variation is manual and uses a GCE compute config.
  - name: rllib_learning_tests_td3_torch
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 1gpu_16cpus.yaml

    run:
      timeout: 18000
      script: >-
        python learning_tests/run.py
        --yaml-sub-dir=td3 --framework=torch

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 1gpu_16cpus_gce.yaml
  # Runs multi_gpu_learning_tests/run.py on the 8gpus_96cpus compute config.
  # Scheduled nightly; the gce variation is manual.
  - name: rllib_multi_gpu_learning_tests
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 8gpus_96cpus.yaml

    run:
      timeout: 7200
      script: python multi_gpu_learning_tests/run.py

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 8gpus_96cpus_gce.yaml
  # Runs multi_gpu_with_lstm_learning_tests/run.py on the 8gpus_96cpus compute
  # config. Scheduled nightly; the gce variation is manual.
  - name: rllib_multi_gpu_with_lstm_learning_tests
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 8gpus_96cpus.yaml

    run:
      timeout: 7200
      script: python multi_gpu_with_lstm_learning_tests/run.py

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 8gpus_96cpus_gce.yaml
  # Runs multi_gpu_with_attention_learning_tests/run.py on the 8gpus_96cpus
  # compute config. Scheduled nightly; the gce variation is manual and
  # temporarily points at a debug cluster env (see TODO below).
  - name: rllib_multi_gpu_with_attention_learning_tests
    group: RLlib tests
    working_dir: rllib_tests

    frequency: nightly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 8gpus_96cpus.yaml

    run:
      timeout: 7200
      script: python multi_gpu_with_attention_learning_tests/run.py

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          # TODO(https://github.com/ray-project/ray/issues/34591)
          # Revert to the comment below once ^ closed.
          # cluster_env: app_config.yaml
          cluster_env: debug_app_config.yaml
          cluster_compute: 8gpus_96cpus_gce.yaml
  # Runs stress_tests/run_stress_tests.py weekly, waiting for 6 nodes.
  # The smoke test runs nightly and only overrides the run timeout.
  - name: rllib_stress_tests
    group: RLlib tests
    working_dir: rllib_tests

    frequency: weekly
    team: rllib

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: 4gpus_544_cpus.yaml

    run:
      timeout: 5400
      script: python stress_tests/run_stress_tests.py
      wait_for_nodes:
        num_nodes: 6

    smoke_test:
      frequency: nightly
      run:
        timeout: 2000

    alert: default

    variations:
      # No overrides for aws.
      - __suffix__: aws
      # gce: manual for both the full test and its smoke test.
      # NOTE(review): compute file names differ in CPU count (544 on the base,
      # 512 on gce) - confirm this is intended.
      - __suffix__: gce
        env: gce
        frequency: manual
        smoke_test:
          frequency: manual
          run:
            timeout: 2000
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: 4gpus_512_cpus_gce.yaml
  3381. ########################
  3382. # Core Nightly Tests
  3383. ########################
  # Runs shuffle/shuffle_test.py with 200 partitions of 500e6 each, waiting
  # for 4 nodes. Scheduled nightly; the gce variation is manual.
  - name: shuffle_100gb
    group: core-multi-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/shuffle_compute_multi.yaml

    run:
      timeout: 3000
      script: >-
        python shuffle/shuffle_test.py
        --num-partitions=200 --partition-size=500e6
      wait_for_nodes:
        num_nodes: 4

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/shuffle_compute_multi_gce.yaml
  # Runs stress_tests/test_placement_group.py.
  # Scheduled nightly; the gce variation is manual.
  - name: stress_test_placement_group
    group: core-multi-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: stress_tests/stress_tests_app_config.yaml
      cluster_compute: stress_tests/placement_group_tests_compute.yaml

    run:
      timeout: 7200
      script: python stress_tests/test_placement_group.py

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: stress_tests/stress_tests_app_config.yaml
          cluster_compute: stress_tests/placement_group_tests_compute_gce.yaml
  # Runs decision_tree/cart_with_tree.py with --concurrency=20.
  # Scheduled nightly; the gce variation is manual.
  - name: decision_tree_autoscaling_20_runs
    group: core-multi-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: decision_tree/decision_tree_app_config.yaml
      cluster_compute: decision_tree/autoscaling_compute.yaml

    run:
      timeout: 9600
      script: python decision_tree/cart_with_tree.py --concurrency=20

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: decision_tree/decision_tree_app_config.yaml
          cluster_compute: decision_tree/autoscaling_compute_gce.yaml
  # Runs shuffle/shuffle_test.py with 1000 partitions of 1e9 each and
  # --no-streaming. Scheduled nightly; the gce variation is manual.
  - name: autoscaling_shuffle_1tb_1000_partitions
    group: core-multi-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/shuffle_compute_autoscaling.yaml

    run:
      timeout: 4000
      script: >-
        python shuffle/shuffle_test.py
        --num-partitions=1000 --partition-size=1e9
        --no-streaming

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/shuffle_compute_autoscaling_gce.yaml
  # Runs run_microbenchmark.py on Python 3.7 with OMP_NUM_THREADS=64 and
  # RAY_ADDRESS=local. Scheduled nightly; the gce variation is manual.
  - name: microbenchmark
    group: core-daily-test
    team: core
    frequency: nightly
    working_dir: microbenchmark

    # Quoted so the version stays a string.
    python: "3.7"

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_64.yaml

    run:
      timeout: 1800
      script: >-
        OMP_NUM_THREADS=64 RAY_ADDRESS=local
        python run_microbenchmark.py

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_64_gce.yaml
  # Runs run_microbenchmark.py on Python 3.8 with OMP_NUM_THREADS=64 and
  # RAY_ADDRESS=local. Scheduled nightly; the gce variation is manual.
  - name: microbenchmark_38
    group: core-daily-test
    team: core
    frequency: nightly
    working_dir: microbenchmark

    # Quoted so the version stays a string.
    python: "3.8"

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: tpl_64.yaml

    run:
      timeout: 1800
      script: >-
        OMP_NUM_THREADS=64 RAY_ADDRESS=local
        python run_microbenchmark.py

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: tpl_64_gce.yaml
  # Runs benchmark_worker_startup.py on Python 3.9 with a GPU cluster env.
  # Marked unstable. Scheduled nightly; the gce variation is manual.
  - name: benchmark_worker_startup
    group: core-daily-test
    team: core
    frequency: nightly
    working_dir: benchmark-worker-startup

    stable: false
    # Quoted so the version stays a string.
    python: "3.9"

    cluster:
      cluster_env: app_config_gpu.yaml
      cluster_compute: only_head_node_1gpu_64cpu.yaml

    run:
      timeout: 7200
      # Folded into a single command line.
      script: >-
        python benchmark_worker_startup.py
        --num_cpus_in_cluster 64
        --num_gpus_in_cluster 64
        --num_tasks_or_actors_per_run 64
        --num_measurements_per_configuration 5

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config_gpu.yaml
          cluster_compute: only_head_node_1gpu_64cpu_gce.yaml
  # Runs dask_on_ray/dask_on_ray_sort.py with --nbytes 100_000_000_000 over
  # 200 partitions. Scheduled nightly; the gce variation is manual.
  - name: dask_on_ray_100gb_sort
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
      cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml

    run:
      timeout: 7200
      script: >-
        python dask_on_ray/dask_on_ray_sort.py
        --nbytes 100_000_000_000 --npartitions 200
        --num-nodes 1 --ray
        --data-dir /tmp/ray --file-path /tmp/ray

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
          cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template_gce.yaml
  # Runs dask_on_ray/large_scale_test.py with 150 workers, waiting for 21
  # nodes. The nightly smoke test uses a smaller compute template, 32 workers
  # and 5 nodes.
  - name: dask_on_ray_large_scale_test_spilling
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: data

    cluster:
      cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
      cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml

    run:
      timeout: 7200
      script: >-
        python dask_on_ray/large_scale_test.py
        --num_workers 150 --worker_obj_store_size_in_gb 70
        --error_rate 0 --data_save_path /tmp/ray
      wait_for_nodes:
        num_nodes: 21

    smoke_test:
      frequency: nightly
      cluster:
        # Uses the `cluster_env` key, like every other cluster mapping in
        # this file (this was `app_config`); the value matches the full test.
        cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
        cluster_compute: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
      run:
        timeout: 7200
        script: >-
          python dask_on_ray/large_scale_test.py
          --num_workers 32 --worker_obj_store_size_in_gb 70
          --error_rate 0 --data_save_path /tmp/ray
        wait_for_nodes:
          num_nodes: 5
  # Runs stress_tests/test_state_api_scale.py nightly, waiting for 7 nodes.
  # The nightly smoke test passes --smoke-test, uses the smoke-test compute
  # config and waits for 5 nodes. The gce variation is manual for both.
  - name: stress_test_state_api_scale
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: stress_tests/state_api_app_config.yaml
      cluster_compute: stress_tests/stress_tests_compute_large.yaml

    run:
      timeout: 3600
      script: python stress_tests/test_state_api_scale.py
      wait_for_nodes:
        num_nodes: 7

    smoke_test:
      frequency: nightly
      cluster:
        # Uses the `cluster_env` key, like every other cluster mapping in
        # this file (this was `app_config`); the value matches the full test.
        cluster_env: stress_tests/state_api_app_config.yaml
        cluster_compute: stress_tests/smoke_test_compute.yaml
      run:
        timeout: 3600
        wait_for_nodes:
          num_nodes: 5
        script: python stress_tests/test_state_api_scale.py --smoke-test

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: stress_tests/state_api_app_config.yaml
          cluster_compute: stress_tests/stress_tests_compute_large_gce.yaml
        smoke_test:
          frequency: manual
  # Runs stress_tests/test_state_api_with_other_tests.py, passing the shuffle
  # test script and its arguments through --test-args.
  # Scheduled nightly; the gce variation is manual.
  - name: shuffle_20gb_with_state_api
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: shuffle/shuffle_with_state_api_app_config.yaml
      cluster_compute: shuffle/shuffle_compute_single.yaml

    run:
      timeout: 1000
      script: >-
        python stress_tests/test_state_api_with_other_tests.py
        nightly_tests/shuffle/shuffle_test.py
        --test-args="--num-partitions=100 --partition-size=200e6"

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_with_state_api_app_config.yaml
          cluster_compute: shuffle/shuffle_compute_single_gce.yaml
  # Runs stress_tests/test_many_tasks.py nightly, waiting for 101 nodes.
  # The nightly smoke test passes --num-nodes=4 --smoke-test, uses the
  # smoke-test compute config and waits for 5 nodes. The gce variation is
  # manual for both.
  - name: stress_test_many_tasks
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: stress_tests/stress_tests_app_config.yaml
      cluster_compute: stress_tests/stress_tests_compute.yaml

    run:
      timeout: 14400
      wait_for_nodes:
        num_nodes: 101
      script: python stress_tests/test_many_tasks.py

    smoke_test:
      frequency: nightly
      cluster:
        # Uses the `cluster_env` key, like every other cluster mapping in
        # this file (this was `app_config`); the value matches the full test.
        cluster_env: stress_tests/stress_tests_app_config.yaml
        cluster_compute: stress_tests/smoke_test_compute.yaml
      run:
        timeout: 3600
        wait_for_nodes:
          num_nodes: 5
        script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: stress_tests/stress_tests_app_config.yaml
          cluster_compute: stress_tests/stress_tests_compute_gce.yaml
        smoke_test:
          frequency: manual
  # Runs stress_tests/test_dead_actors.py nightly, waiting for 101 nodes.
  # The nightly smoke test runs with --num-nodes=4, 3 parents and 3 children
  # on the smoke-test compute config, waiting for 5 nodes. The gce variation
  # is manual for both.
  - name: stress_test_dead_actors
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: stress_tests/stress_tests_app_config.yaml
      cluster_compute: stress_tests/stress_tests_compute.yaml

    run:
      timeout: 7200
      wait_for_nodes:
        num_nodes: 101
      script: python stress_tests/test_dead_actors.py

    smoke_test:
      frequency: nightly
      cluster:
        # Uses the `cluster_env` key, like every other cluster mapping in
        # this file (this was `app_config`); the value matches the full test.
        cluster_env: stress_tests/stress_tests_app_config.yaml
        cluster_compute: stress_tests/smoke_test_compute.yaml
      run:
        timeout: 3600
        wait_for_nodes:
          num_nodes: 5
        script: >-
          python stress_tests/test_dead_actors.py
          --num-nodes=4 --num-parents=3 --num-children=3

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: stress_tests/stress_tests_app_config.yaml
          cluster_compute: stress_tests/stress_tests_compute_gce.yaml
        smoke_test:
          frequency: manual
  # The full test is not stable, so run the smoke test only.
  # See https://github.com/ray-project/ray/issues/23244.
  # This entry therefore uses the smoke-test compute config and a
  # 1800 test runtime with a 30 kill interval, waiting for 5 nodes.
  - name: threaded_actors_stress_test
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: stress_tests/stress_tests_app_config.yaml
      cluster_compute: stress_tests/smoke_test_compute.yaml

    run:
      timeout: 3600
      script: >-
        python stress_tests/test_threaded_actors.py
        --test-runtime 1800 --kill-interval_s 30
      wait_for_nodes:
        num_nodes: 5

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: stress_tests/stress_tests_app_config.yaml
          cluster_compute: stress_tests/smoke_test_compute_gce.yaml
  3717. # - name: threaded_actors_stress_test
  3718. # group: core-daily-test
  3719. # working_dir: nightly_tests
  3720. #
  3721. # frequency: nightly
  3722. # team: core
  3723. # cluster:
  3724. # cluster_env: stress_tests/stress_tests_app_config.yaml
  3725. # cluster_compute: stress_tests/stress_test_threaded_actor_compute.yaml
  3726. #
  3727. # run:
  3728. # timeout: 7200
  3729. # script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s
  3730. # 60
  3731. #
  3732. # wait_for_nodes:
  3733. # num_nodes: 201
  3734. # timeout: 600
  3735. #
  3736. # smoke_test:
  3737. # frequency: nightly
  3738. # cluster:
  3739. # app_config: stress_tests/stress_tests_app_config.yaml
  3740. # cluster_compute: stress_tests/smoke_test_compute.yaml
  3741. #
  3742. # run:
  3743. # timeout: 3600
  3744. # script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
  3745. # 30
  3746. #
  3747. # wait_for_nodes:
  3748. # num_nodes: 5
  3749. # timeout: 600
  # Runs stress_tests/test_parallel_tasks_memory_pressure.py with 20 tasks.
  # Scheduled nightly; the gce variation is manual.
  - name: single_node_oom
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: stress_tests/stress_tests_single_node_oom_app_config.yaml
      cluster_compute: stress_tests/stress_tests_single_node_oom_compute.yaml

    run:
      timeout: 500
      script: >-
        python stress_tests/test_parallel_tasks_memory_pressure.py
        --num-tasks 20

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: stress_tests/stress_tests_single_node_oom_app_config.yaml
          cluster_compute: stress_tests/stress_tests_single_node_oom_compute_gce.yaml
  # Runs oom/tune_air_oom.sh from air_tests. Marked unstable and jailed.
  # Scheduled nightly; no variations are declared.
  - name: tune_air_oom
    group: core-daily-test
    working_dir: air_tests

    stable: false
    jailed: true

    frequency: nightly
    team: core

    cluster:
      cluster_env: oom/stress_tests_tune_air_oom_app_config.yaml
      cluster_compute: oom/stress_tests_tune_air_oom_compute.yaml

    run:
      timeout: 3600
      script: bash oom/tune_air_oom.sh
  # Runs dask_on_ray/dask_on_ray_sort.py with --nbytes 1_000_000_000_000 over
  # 1000 partitions and 31 nodes, waiting for 32 nodes.
  # Scheduled nightly-3x; no variations are declared.
  - name: dask_on_ray_1tb_sort
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly-3x
    team: core

    cluster:
      cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
      cluster_compute: dask_on_ray/1tb_sort_compute.yaml

    run:
      timeout: 7200
      script: >-
        python dask_on_ray/dask_on_ray_sort.py
        --nbytes 1_000_000_000_000 --npartitions 1000
        --num-nodes 31 --ray
        --data-dir /tmp/ray --s3-bucket core-nightly-test
      wait_for_nodes:
        num_nodes: 32
  # Runs distributed/many_nodes_tests/actor_test.py for 10000 and 20000 total
  # actors at 0.2 CPUs per actor, waiting for 500 nodes.
  # Scheduled nightly-3x; the gce variation is manual.
  - name: many_nodes_actor_test_on_v2
    group: core-daily-test
    working_dir: benchmarks

    frequency: nightly-3x
    team: core

    cluster:
      cluster_env: distributed/many_nodes_tests/app_config.yaml
      cluster_compute: distributed/many_nodes_tests/compute_config.yaml

    run:
      timeout: 3600
      # 2cpus per node x 1000 nodes / 0.2 cpus per actor = 10k
      # 2cpus per node x 2000 nodes / 0.2 cpus per actor = 20k
      script: >-
        python distributed/many_nodes_tests/actor_test.py
        --no-wait --cpus-per-actor=0.2
        --total-actors 10000 20000
      wait_for_nodes:
        num_nodes: 500

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: distributed/many_nodes_tests/app_config.yaml
          cluster_compute: distributed/many_nodes_tests/compute_config_gce.yaml
  3819. #- name: many_nodes_multi_master_test
  3820. # group: core-daily-test
  3821. # working_dir: nightly_tests
  3822. #
  3823. # frequency: nightly-3x
  3824. # team: core
  3825. # cluster:
  3826. # cluster_env: many_nodes_tests/app_config.yaml
  3827. # cluster_compute: many_nodes_tests/compute_config.yaml
  3828. #
  3829. # run:
  3830. # timeout: 7200
  3831. # script: python many_nodes_tests/multi_master_test.py
  3832. # wait_for_nodes:
  3833. # num_nodes: 251
  3834. #
  3835. # type: anyscale_job
  3836. # file_manager: sdk
  # Runs placement_group_tests/pg_run.py.
  # Scheduled nightly; the gce variation is manual.
  - name: pg_autoscaling_regression_test
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: placement_group_tests/app_config.yaml
      cluster_compute: placement_group_tests/compute.yaml

    run:
      timeout: 1200
      script: python placement_group_tests/pg_run.py

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: placement_group_tests/app_config.yaml
          cluster_compute: placement_group_tests/compute_gce.yaml
  # Runs placement_group_tests/placement_group_performance_test.py, waiting
  # for 5 nodes. Scheduled nightly; the gce variation is manual.
  - name: placement_group_performance_test
    group: core-daily-test
    working_dir: nightly_tests

    frequency: nightly
    team: core

    cluster:
      cluster_env: placement_group_tests/app_config.yaml
      cluster_compute: placement_group_tests/pg_perf_test_compute.yaml

    run:
      timeout: 1200
      script: python placement_group_tests/placement_group_performance_test.py
      wait_for_nodes:
        num_nodes: 5

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: placement_group_tests/app_config.yaml
          cluster_compute: placement_group_tests/pg_perf_test_compute_gce.yaml
  3877. #########################
  3878. # Core Scalability Tests
  3879. #########################
  # Runs single_node/test_single_node.py after a no-op prepare step
  # ("sleep 0"). Scheduled nightly; the gce variation is manual.
  - name: single_node
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node.yaml

    run:
      timeout: 12000
      prepare: sleep 0
      script: python single_node/test_single_node.py

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_gce.yaml
  # Runs object_store/test_object_store.py, waiting for 50 nodes.
  # Scheduled nightly; the gce variation is manual.
  - name: object_store
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: object_store.yaml

    run:
      timeout: 3600
      script: python object_store/test_object_store.py
      wait_for_nodes:
        num_nodes: 50

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: object_store_gce.yaml
  # Runs distributed/test_many_actors.py, waiting for 65 nodes.
  # Scheduled nightly-3x; the gce variation is manual.
  - name: many_actors
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly-3x
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed.yaml

    run:
      timeout: 3600
      script: python distributed/test_many_actors.py
      wait_for_nodes:
        num_nodes: 65

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: distributed_gce.yaml
  # Runs distributed/test_many_actors.py with SMOKE_TEST=1 on the smoke-test
  # compute config, waiting for 2 nodes. Scheduled nightly; no variations.
  - name: many_actors_smoke_test
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed_smoke_test.yaml

    run:
      timeout: 3600
      script: SMOKE_TEST=1 python distributed/test_many_actors.py
      wait_for_nodes:
        num_nodes: 2
  # Runs distributed/test_many_tasks.py with --num-tasks=10000, waiting for
  # 65 nodes. Scheduled nightly; the gce variation is manual.
  - name: many_tasks
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed.yaml

    run:
      timeout: 3600
      script: python distributed/test_many_tasks.py --num-tasks=10000
      wait_for_nodes:
        num_nodes: 65

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: distributed_gce.yaml
  # Runs distributed/test_many_pgs.py, waiting for 65 nodes.
  # Scheduled nightly-3x; the gce variation is manual.
  - name: many_pgs
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly-3x
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed.yaml

    run:
      timeout: 3600
      script: python distributed/test_many_pgs.py
      wait_for_nodes:
        num_nodes: 65

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: distributed_gce.yaml
  # Runs distributed/test_many_pgs.py with SMOKE_TEST=1 on the smoke-test
  # compute config, waiting for 2 nodes. Scheduled nightly; no variations.
  - name: many_pgs_smoke_test
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: distributed_smoke_test.yaml

    run:
      timeout: 3600
      script: SMOKE_TEST=1 python distributed/test_many_pgs.py
      wait_for_nodes:
        num_nodes: 2
  # Runs distributed/test_many_tasks.py with --num-tasks=1000, waiting for
  # 250 nodes. Scheduled nightly-3x; the gce variation is manual.
  - name: many_nodes
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly-3x
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: many_nodes.yaml

    run:
      timeout: 3600
      script: python distributed/test_many_tasks.py --num-tasks=1000
      wait_for_nodes:
        num_nodes: 250

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: many_nodes_gce.yaml
  # Runs distributed/test_scheduling.py with 1984000 zero-duration tasks and
  # 32 actors (one per node), waiting for 32 nodes.
  # Scheduled nightly; the gce variation is manual.
  - name: scheduling_test_many_0s_tasks_many_nodes
    group: core-scalability-test
    working_dir: benchmarks

    frequency: nightly
    team: core

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: scheduling.yaml

    run:
      timeout: 3600
      script: >-
        python distributed/test_scheduling.py
        --total-num-task=1984000 --num-cpu-per-task=1
        --task-duration-s=0
        --total-num-actors=32 --num-actors-per-nodes=1
      wait_for_nodes:
        num_nodes: 32

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: scheduling_gce.yaml
  4053. # - name: scheduling_test_many_5s_tasks_single_node
  4054. # group: core-scalability-test
  4055. # working_dir: benchmarks
  4056. # frequency: nightly
  4057. # team: core
  4058. # cluster:
  4059. # cluster_env: app_config.yaml
  4060. # cluster_compute: scheduling.yaml
  4061. # run:
  4062. # timeout: 3600
  4063. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  4064. # --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  4065. # wait_for_nodes:
  4066. # num_nodes: 32
  4067. # timeout: 600
  4068. # stable: false
  4069. # - name: scheduling_test_many_5s_tasks_many_nodes
  4070. # group: core-scalability-test
  4071. # working_dir: benchmarks
  4072. # frequency: nightly
  4073. # team: core
  4074. # cluster:
  4075. # cluster_env: app_config.yaml
  4076. # cluster_compute: scheduling.yaml
  4077. # run:
  4078. # timeout: 3600
  4079. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  4080. # --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  4081. # wait_for_nodes:
  4082. # num_nodes: 32
  4083. # timeout: 600
  4084. # stable: false
  4085. ###############
  4086. # Dataset tests
  4087. ###############
  # Runs inference.py from nightly_tests/dataset, waiting for 2 nodes.
  # Scheduled nightly; the gce variation is manual.
  - name: inference
    group: data-tests
    working_dir: nightly_tests/dataset

    frequency: nightly
    team: data

    cluster:
      cluster_env: app_config.yaml
      cluster_compute: inference.yaml

    run:
      timeout: 600
      script: python inference.py
      wait_for_nodes:
        num_nodes: 2

    variations:
      # No overrides for aws.
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: inference_gce.yaml
  # Shuffle data loader test; the script receives the target cloud via --cloud,
  # so the GCE variation overrides the script as well as the compute config.
  - name: shuffle_data_loader
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle_app_config.yaml
      cluster_compute: shuffle_compute.yaml
    run:
      timeout: 1800
      script: python dataset_shuffle_data_loader.py --cloud aws
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: shuffle_compute_gce.yaml
        run:
          script: python dataset_shuffle_data_loader.py --cloud gcp
  # Parquet metadata resolution over 915 files on a single-node benchmark
  # compute config; --cloud selects the cloud the script targets.
  - name: parquet_metadata_resolution
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The test is expected to finish in roughly 40 seconds.
      timeout: 100
      script: >-
        python parquet_metadata_resolution.py --num-files 915
        --cloud aws
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: single_node_benchmark_compute_gce.yaml
        run:
          script: >-
            python parquet_metadata_resolution.py --num-files 915
            --cloud gcp
  # Dataset random-access test on the 15-node pipelined-training cluster.
  # Marked unstable (stable: false).
  - name: dataset_random_access
    group: data-tests
    working_dir: nightly_tests/dataset
    stable: false
    frequency: nightly
    team: data
    cluster:
      cluster_env: pipelined_training_app.yaml
      cluster_compute: pipelined_training_compute.yaml
    run:
      timeout: 1200
      script: python dataset_random_access.py
      wait_for_nodes:
        num_nodes: 15
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: pipelined_training_app.yaml
          cluster_compute: pipelined_training_compute_gce.yaml
  # Data ingest benchmark, 1000 GB dataset, 20 workers, --streaming mode.
  - name: pipelined_data_ingest_benchmark_1tb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: data_ingest_benchmark_compute.yaml
    run:
      timeout: 300
      script: >-
        python data_ingest_benchmark.py --dataset-size-gb=1000
        --num-workers=20 --streaming
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: data_ingest_benchmark_compute_gce.yaml
  # Data ingest benchmark, 1000 GB dataset, 20 workers, --new_streaming mode.
  - name: streaming_data_ingest_benchmark_1tb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: data_ingest_benchmark_compute.yaml
    run:
      timeout: 300
      script: >-
        python data_ingest_benchmark.py --dataset-size-gb=1000
        --num-workers=20 --new_streaming
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: data_ingest_benchmark_compute_gce.yaml
  # Aggregation benchmark on the single-node benchmark compute config.
  - name: aggregate_benchmark
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      timeout: 1800
      script: python aggregate_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # Parquet read benchmark on the single-node benchmark compute config.
  - name: read_parquet_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to finish within 400 seconds.
      timeout: 400
      script: python read_parquet_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # Image read benchmark on the single-node benchmark compute config.
  - name: read_images_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      timeout: 1800
      script: python read_images_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # TFRecords read benchmark; uses its own cluster env
  # (read_tfrecords_benchmark_app.yaml) on the single-node compute config.
  - name: read_tfrecords_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: read_tfrecords_benchmark_app.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to finish in roughly 22 minutes.
      timeout: 1800
      script: python read_tfrecords_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: read_tfrecords_benchmark_app.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # map_batches benchmark on the single-node benchmark compute config.
  - name: map_batches_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to finish in roughly 30 minutes.
      timeout: 2400
      script: python map_batches_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # iter_tensor_batches benchmark, single-node compute config, default
  # script arguments.
  - name: iter_tensor_batches_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to finish in roughly 30 minutes.
      timeout: 2400
      script: python iter_tensor_batches_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # iter_tensor_batches benchmark on the multi-node compute config, run with
  # --data-size-gb=10.
  - name: iter_tensor_batches_benchmark_multi_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: multi_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to finish in roughly 30 minutes.
      timeout: 2400
      script: python iter_tensor_batches_benchmark.py --data-size-gb=10
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: multi_node_benchmark_compute_gce.yaml
  # iter_batches benchmark on the single-node benchmark compute config.
  - name: iter_batches_benchmark_single_node
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: single_node_benchmark_compute.yaml
    run:
      # The benchmark is expected to finish in roughly 12 minutes.
      timeout: 1080
      script: python iter_batches_benchmark.py
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: app_config.yaml
          cluster_compute: single_node_benchmark_compute_gce.yaml
  # Pipelined training, one epoch, on the 15-node pipelined-training cluster.
  - name: pipelined_training_50_gb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: pipelined_training_app.yaml
      cluster_compute: pipelined_training_compute.yaml
    run:
      timeout: 4800
      script: python pipelined_training.py --epochs 1
      wait_for_nodes:
        num_nodes: 15
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: pipelined_training_app.yaml
          cluster_compute: pipelined_training_compute_gce.yaml
  # Pipelined ingestion: 2 epochs, 5 windows, 915 input files, --debug, on a
  # 21-node cluster built from the pipelined_ingestion_* configs.
  - name: pipelined_ingestion_1500_gb
    group: data-tests
    working_dir: nightly_tests/dataset
    frequency: nightly
    team: data
    cluster:
      cluster_env: pipelined_ingestion_app.yaml
      cluster_compute: pipelined_ingestion_compute.yaml
    run:
      timeout: 9600
      script: >-
        python pipelined_training.py --epochs 2 --num-windows 5
        --num-files 915 --debug
      wait_for_nodes:
        num_nodes: 21
    variations:
      - __suffix__: aws
      # The GCE variation must use the ingestion configs, not the
      # pipelined_training_* ones it previously pointed at: this test waits
      # for 21 nodes, whereas the pipelined-training tests wait for 15.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: pipelined_ingestion_app.yaml
          cluster_compute: pipelined_ingestion_compute_gce.yaml
  # Random shuffle via dataset/sort.py --shuffle: 1000 partitions of 1e9 each,
  # 20 nodes.
  - name: dataset_shuffle_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        python dataset/sort.py --num-partitions=1000
        --partition-size=1e9 --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Sort via dataset/sort.py: 1000 partitions of 1e9 each, 20 nodes.
  - name: dataset_shuffle_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        python dataset/sort.py --num-partitions=1000
        --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Random shuffle with RAY_DATA_PUSH_BASED_SHUFFLE=1 set for the script:
  # 1000 partitions of 1e9 each, 20 nodes.
  - name: dataset_shuffle_push_based_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py
        --num-partitions=1000 --partition-size=1e9 --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Sort with RAY_DATA_PUSH_BASED_SHUFFLE=1 set for the script:
  # 1000 partitions of 1e9 each, 20 nodes.
  - name: dataset_shuffle_push_based_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py
        --num-partitions=1000 --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Weekly push-based random shuffle: 100000 partitions of 1e9 each on a
  # 100-node cluster, with an 8-hour (28800 s) timeout.
  - name: dataset_shuffle_push_based_random_shuffle_100tb
    group: data-tests
    working_dir: nightly_tests
    frequency: weekly
    team: data
    cluster:
      cluster_env: shuffle/100tb_shuffle_app_config.yaml
      cluster_compute: shuffle/100tb_shuffle_compute.yaml
    run:
      timeout: 28800
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py
        --num-partitions=100000 --partition-size=1e9 --shuffle
      wait_for_nodes:
        num_nodes: 100
    variations:
      - __suffix__: aws
      # NOTE(review): the GCE variation runs 40000 partitions instead of
      # 100000; presumably a deliberate scale-down -- confirm with the owner.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/100tb_shuffle_app_config_gce.yaml
          cluster_compute: shuffle/100tb_shuffle_compute_gce.yaml
        run:
          timeout: 28800
          script: >-
            RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py
            --num-partitions=40000 --partition-size=1e9 --shuffle
          wait_for_nodes:
            num_nodes: 100
  4525. ##################
  4526. # Core Chaos tests
  4527. ##################
  # Chaos test with the tasks workload on a 10-node cluster. The prepare
  # step runs setup_chaos.py with --no-start before the test script.
  - name: chaos_many_tasks_no_object_store
    group: core-nightly-test
    working_dir: nightly_tests
    frequency: nightly
    team: core
    cluster:
      cluster_env: chaos_test/app_config.yaml
      cluster_compute: chaos_test/compute_template.yaml
    run:
      timeout: 3600
      wait_for_nodes:
        num_nodes: 10
      prepare: python setup_chaos.py --no-start
      script: python chaos_test/test_chaos_basic.py --workload=tasks
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/app_config.yaml
          cluster_compute: chaos_test/compute_template_gce.yaml
  # Chaos test with the actors workload on a 10-node cluster. The prepare
  # step runs setup_chaos.py with --no-start before the test script.
  - name: chaos_many_actors
    group: core-nightly-test
    working_dir: nightly_tests
    frequency: nightly
    team: core
    cluster:
      cluster_env: chaos_test/app_config.yaml
      cluster_compute: chaos_test/compute_template.yaml
    run:
      timeout: 4200
      wait_for_nodes:
        num_nodes: 10
      prepare: python setup_chaos.py --no-start
      script: python chaos_test/test_chaos_basic.py --workload=actors
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/app_config.yaml
          cluster_compute: chaos_test/compute_template_gce.yaml
  # Dask-on-Ray large-scale test under chaos: setup_chaos.py is run with
  # --node-kill-interval 100; the workload uses 20 workers with
  # --worker_obj_store_size_in_gb 20 on a 21-node cluster.
  - name: chaos_dask_on_ray_large_scale_test_no_spilling
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
      cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml
    run:
      timeout: 7200
      wait_for_nodes:
        num_nodes: 21
      prepare: python setup_chaos.py --node-kill-interval 100
      script: >-
        python dask_on_ray/large_scale_test.py --num_workers 20
        --worker_obj_store_size_in_gb 20 --error_rate 0
        --data_save_path /tmp/ray
    variations:
      - __suffix__: aws
      # NOTE(review): the base compute file is chaos-prefixed but the GCE one
      # is not (dask_on_ray_stress_compute_gce.yaml) -- confirm this is
      # intended.
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
          cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  # Dask-on-Ray large-scale test under chaos: setup_chaos.py is run with
  # --node-kill-interval 100; the workload uses 150 workers with
  # --worker_obj_store_size_in_gb 70 on a 21-node cluster.
  - name: chaos_dask_on_ray_large_scale_test_spilling
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
      cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
    run:
      timeout: 7200
      wait_for_nodes:
        num_nodes: 21
      prepare: python setup_chaos.py --node-kill-interval 100
      script: >-
        python dask_on_ray/large_scale_test.py --num_workers 150
        --worker_obj_store_size_in_gb 70 --error_rate 0
        --data_save_path /tmp/ray
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
          cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  # Pipelined ingestion under chaos: 1 epoch, 15 windows, 915 files, with
  # setup_chaos.py run at --node-kill-interval 300 on a 21-node cluster.
  - name: chaos_pipelined_ingestion_1500_gb_15_windows
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: dataset/pipelined_ingestion_app.yaml
      cluster_compute: dataset/pipelined_ingestion_compute.yaml
    run:
      timeout: 7200
      wait_for_nodes:
        num_nodes: 21
      # NOTE(review): the leading space inside the quoted command is kept
      # exactly as it was; presumably unintentional -- confirm before trimming.
      prepare: ' python setup_chaos.py --node-kill-interval 300'
      script: >-
        python dataset/pipelined_training.py --epochs 1 --num-windows 15
        --num-files 915 --debug
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: dataset/pipelined_ingestion_app.yaml
          cluster_compute: dataset/pipelined_ingestion_compute_gce.yaml
  # Push-based sort under chaos (setup_chaos.py --node-kill-interval 1200,
  # --max-nodes-to-kill 3). Marked unstable (stable: false).
  - name: chaos_dataset_shuffle_push_based_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      # NOTE(review): the leading space inside the quoted command is kept
      # exactly as it was; presumably unintentional -- confirm before trimming.
      prepare: ' python setup_chaos.py --node-kill-interval 1200 --max-nodes-to-kill 3'
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py
        --num-partitions=1000 --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Sort under chaos (setup_chaos.py --node-kill-interval 900,
  # --max-nodes-to-kill 3), using the oom_disabled shuffle cluster env.
  - name: chaos_dataset_shuffle_sort_1tb
    group: data-tests
    working_dir: nightly_tests
    frequency: nightly
    team: data
    cluster:
      cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      prepare: 'python setup_chaos.py --node-kill-interval 900 --max-nodes-to-kill 3'
      script: >-
        python dataset/sort.py --num-partitions=1000
        --partition-size=1e9
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Random shuffle under chaos (setup_chaos.py --node-kill-interval 600,
  # --max-nodes-to-kill 2). Marked unstable (stable: false).
  - name: chaos_dataset_shuffle_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      # OOM stays disabled while this test is marked unstable.
      cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      # NOTE(review): the leading space inside the quoted command is kept
      # exactly as it was; presumably unintentional -- confirm before trimming.
      prepare: ' python setup_chaos.py --node-kill-interval 600 --max-nodes-to-kill 2'
      script: >-
        python dataset/sort.py --num-partitions=1000
        --partition-size=1e9 --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  # Push-based random shuffle under chaos (setup_chaos.py
  # --node-kill-interval 600, --max-nodes-to-kill 2). Marked unstable.
  - name: chaos_dataset_shuffle_push_based_random_shuffle_1tb
    group: data-tests
    working_dir: nightly_tests
    stable: false
    frequency: nightly
    team: data
    cluster:
      # OOM stays disabled while this test is marked unstable.
      cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
      cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
    run:
      timeout: 7200
      # NOTE(review): the leading space inside the quoted command is kept
      # exactly as it was; presumably unintentional -- confirm before trimming.
      prepare: ' python setup_chaos.py --node-kill-interval 600 --max-nodes-to-kill 2'
      script: >-
        RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py
        --num-partitions=1000 --partition-size=1e9 --shuffle
      wait_for_nodes:
        num_nodes: 20
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: shuffle/shuffle_app_config_oom_disabled.yaml
          cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  4734. #####################
  4735. # Observability tests
  4736. #####################
  # Agent stress test: runs mem_check.py from the dashboard working
  # directory with a 4-hour (14400 s) timeout.
  - name: agent_stress_test
    group: core-observability-test
    working_dir: dashboard
    stable: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: agent_stress_app_config.yaml
      cluster_compute: agent_stress_compute.yaml
    run:
      timeout: 14400
      script: python mem_check.py --working-dir .
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_env: agent_stress_app_config.yaml
          cluster_compute: agent_stress_compute_gce.yaml
  # Serve HA test on Kubernetes: prepare.sh is run first, then
  # run_gcs_ft_on_k8s.py. Marked unstable; declares no cloud variations.
  - name: k8s_serve_ha_test
    group: k8s-test
    working_dir: k8s_tests
    stable: false
    frequency: nightly
    team: serve
    cluster:
      cluster_env: app_config.yaml
      cluster_compute: compute_tpl.yaml
    run:
      timeout: 28800  # 8h
      prepare: bash prepare.sh
      script: python run_gcs_ft_on_k8s.py
  # AWS cluster launcher check: from tests/, runs
  # aws_launch_and_verify_cluster.py against aws_cluster.yaml.
  - name: aws_cluster_launcher
    group: cluster-launcher-test
    working_dir: ../python/ray/autoscaler/aws/
    stable: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: tests/aws_config.yaml
      cluster_compute: tests/aws_compute.yaml
    run:
      timeout: 1200
      script: >-
        cd tests && python aws_launch_and_verify_cluster.py
        aws_cluster.yaml
  # AWS cluster launcher check: from tests/, runs
  # aws_launch_and_verify_cluster.py against ../example-minimal.yaml.
  - name: aws_cluster_launcher_minimal
    group: cluster-launcher-test
    working_dir: ../python/ray/autoscaler/aws/
    stable: true
    frequency: nightly
    team: core
    cluster:
      cluster_env: tests/aws_config.yaml
      cluster_compute: tests/aws_compute.yaml
    run:
      timeout: 1200
      script: >-
        cd tests && python aws_launch_and_verify_cluster.py
        ../example-minimal.yaml
  4794. - name: aws_cluster_launcher_full
  4795. group: cluster-launcher-test
  4796. working_dir: ../python/ray/autoscaler/aws/
  4797. stable: true
  4798. frequency: nightly
  4799. team: core
  4800. cluster:
  4801. cluster_env: tests/aws_config.yaml
  4802. cluster_compute: tests/aws_compute.yaml
  4803. run:
  4804. timeout: 1200
  4805. script: cd tests && python aws_launch_and_verify_cluster.py ../example-full.yaml