release_tests.yaml 110 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046
  1. # Global release test configuration file.
  2. # All your release test configuration should go here. Adding release tests here
  3. # will automatically enable them in the Buildkite release testing schedules
  4. # (unless they have frequency: manual).
  5. # Here is an example configuration for reference:
  6. #- name: example_test
  7. # # Tests with the same group will be grouped in the Buildkite UI
  8. # group: Example group
  9. # # Provide the working directory which will be uploaded to the cluster
  10. # working_dir: example_dir
  11. #
  12. # # How often to run the tests.
  13. # # One of [manual, any, multi, nightly, nightly-3x, weekly].
  14. # # Descriptions of the frequencies that are not immediately obvious:
  15. # # - manual: Not run on a schedule, but can be manually run through the buildkite UI.
  16. # # - nightly-3x: Run 3 times a week (Monday, Wednesday, Friday).
  17. # frequency: weekly
  18. # # Owning team. This field will be persisted to the database
  19. # team: ml
  20. #
  21. # # Python version. This optional field determines which Python version to run tests
  22. # # on. This must be a string!
  23. # python: "3.7"
  24. #
  25. # # Cluster information
  26. # cluster:
  27. # # Location of cluster compute, relative to working_dir
  28. # cluster_compute: cluster_compute.yaml
  29. # # Autosuspend parameter passed to the cluster.
  30. # # The cluster will automatically terminate if inactive for this
  31. # # many minutes. Defaults to 10 if not set.
  32. # autosuspend_mins: 10
  33. # # Optional cloud_id to use instead of the default cloud
  34. # cloud_id: cld_12345678
  35. # # Alternatively, you can specify a cloud name
  36. # cloud_name: anyscale_default_cloud
  37. #
  38. # # Run configuration for the test
  39. # run:
  40. # # If you want to wait for nodes to be ready, you can specify this here:
  41. # wait_for_nodes:
  42. # # Number of nodes
  43. # num_nodes: 16
  44. # # Timeout for waiting for nodes. If nodes are not up by then, the
  45. # # test will fail.
  46. # timeout: 600
  47. #
  48. # # Optional prepare script to be run on the cluster before the test script
  49. # prepare: python prepare.py
  50. # # The prepare command can have a separate timeout
  51. # prepare_timeout: 300
  52. #
  53. # # Main script to run as the test script
  54. # script: python workloads/train_small.py
  55. # # Timeout in seconds. After this time the test is considered as failed.
  56. # timeout: 600
  57. #
  58. # # You can specify smoke test definitions here. If a smoke test is triggered,
  59. # # it will deep update the main test configuration with the values provided
  60. # # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
  61. # # environment variable and receive the --smoke-test flag as a parameter in the
  62. # # run script.
  63. # smoke_test:
  64. # # Smoke tests can have different frequencies. A smoke test is only triggered
  65. # # when the regular test is not matched.
  66. # frequency: nightly
  67. # # Here we adjust the run timeout down and run on fewer nodes. The test script
  68. # # remains the same.
  69. # run:
  70. # timeout: 300
  71. # wait_for_nodes:
  72. # num_nodes: 4
  73. # timeout: 600
  74. #
  75. # # After the test has finished, this handler (in alerts/) will process the results.
  76. # # It can then let the test fail, e.g. if a metric regression is observed.
  77. # alert: default
  78. #######################
  79. # Cluster scaling tests
  80. #######################
  81. - name: cluster_tune_scale_up_down
  82. group: Cluster tests
  83. working_dir: cluster_tests
  84. frequency: nightly
  85. team: ml
  86. cluster:
  87. byod: {}
  88. cluster_compute: cpt_autoscaling_1-3_aws.yaml
  89. run:
  90. timeout: 3600
  91. script: python workloads/tune_scale_up_down.py
  92. wait_for_nodes:
  93. num_nodes: 0
  94. variations:
  95. - __suffix__: aws
  96. - __suffix__: gce
  97. env: gce
  98. frequency: manual
  99. cluster:
  100. cluster_compute: cpt_autoscaling_1-3_gce.yaml
  101. alert: default
  102. ############################
  103. # Batch Inference Benchmarks
  104. ############################
  105. # 10 GB image classification raw images with 1 GPU.
  106. # 1 g4dn.4xlarge
  107. - name: torch_batch_inference_1_gpu_10gb_raw
  108. group: data-tests
  109. working_dir: nightly_tests/dataset
  110. frequency: nightly
  111. team: data
  112. cluster:
  113. byod:
  114. type: gpu
  115. cluster_compute: compute_gpu_1_cpu_16_aws.yaml
  116. run:
  117. timeout: 500
  118. script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw
  119. alert: default
  120. variations:
  121. - __suffix__: aws
  122. - __suffix__: gce
  123. env: gce
  124. frequency: manual
  125. cluster:
  126. cluster_compute: compute_gpu_1_cpu_16_gce.yaml
  127. # 10 GB image classification parquet with 1 GPU.
  128. # 1 g4dn.4xlarge
  129. - name: torch_batch_inference_1_gpu_10gb_parquet
  130. group: data-tests
  131. working_dir: nightly_tests/dataset
  132. frequency: nightly
  133. team: data
  134. cluster:
  135. byod:
  136. type: gpu
  137. cluster_compute: compute_gpu_1_cpu_16_aws.yaml
  138. run:
  139. timeout: 500
  140. script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet
  141. alert: default
  142. variations:
  143. - __suffix__: aws
  144. - __suffix__: gce
  145. env: gce
  146. frequency: manual
  147. cluster:
  148. cluster_compute: compute_gpu_1_cpu_16_gce.yaml
  149. # 300 GB image classification raw images with 16 GPUs
  150. # 4 g4dn.12xlarge
  151. - name: torch_batch_inference_16_gpu_300gb_raw
  152. group: data-tests
  153. working_dir: nightly_tests/dataset
  154. frequency: nightly
  155. team: data
  156. cluster:
  157. byod:
  158. type: gpu
  159. cluster_compute: compute_gpu_4x4_aws.yaml
  160. run:
  161. timeout: 1000
  162. script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
  163. wait_for_nodes:
  164. num_nodes: 4
  165. alert: default
  166. variations:
  167. - __suffix__: aws
  168. - __suffix__: gce
  169. env: gce
  170. frequency: manual
  171. cluster:
  172. cluster_compute: compute_gpu_4x4_gce.yaml
  173. - name: chaos_torch_batch_inference_16_gpu_300gb_raw
  174. group: data-tests
  175. working_dir: nightly_tests
  176. stable: false
  177. frequency: nightly
  178. team: data
  179. cluster:
  180. byod:
  181. type: gpu
  182. cluster_compute: dataset/compute_gpu_4x4_aws.yaml
  183. run:
  184. timeout: 1000
  185. prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
  186. script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
  187. wait_for_nodes:
  188. num_nodes: 4
  189. alert: default
  190. variations:
  191. - __suffix__: aws
  192. - __suffix__: gce
  193. env: gce
  194. frequency: manual
  195. cluster:
  196. cluster_compute: dataset/compute_gpu_4x4_gce.yaml
  197. # 300 GB image classification parquet data with 16 GPUs
  198. # 4 g4dn.12xlarge
  199. - name: torch_batch_inference_16_gpu_300gb_parquet
  200. group: data-tests
  201. working_dir: nightly_tests/dataset
  202. frequency: nightly
  203. team: data
  204. cluster:
  205. byod:
  206. type: gpu
  207. cluster_compute: compute_gpu_4x4_aws.yaml
  208. run:
  209. timeout: 1000
  210. script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
  211. wait_for_nodes:
  212. num_nodes: 4
  213. alert: default
  214. variations:
  215. - __suffix__: aws
  216. - __suffix__: gce
  217. env: gce
  218. frequency: manual
  219. cluster:
  220. cluster_compute: compute_gpu_4x4_gce.yaml
  221. # 10 TB image classification parquet data with a heterogeneous cluster
  222. # 10 g4dn.12xlarge, 10 m5.16xlarge
  223. - name: torch_batch_inference_hetero_10tb_parquet
  224. group: data-tests
  225. working_dir: nightly_tests/dataset
  226. frequency: weekly
  227. team: data
  228. cluster:
  229. byod:
  230. type: gpu
  231. cluster_compute: compute_hetero_10x10_aws.yaml
  232. run:
  233. timeout: 2000
  234. script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
  235. wait_for_nodes:
  236. num_nodes: 20
  237. alert: default
  238. #########################
  239. # AIR release tests
  240. #########################
  241. - name: tune_with_frequent_pausing
  242. group: AIR tests
  243. working_dir: air_tests
  244. frequency: nightly-3x
  245. team: ml
  246. cluster:
  247. byod:
  248. runtime_env:
  249. - RAY_memory_usage_threshold=0.5
  250. - automatic_object_spilling_enabled=0
  251. cluster_compute: frequent_pausing/compute_config_aws.yaml
  252. run:
  253. timeout: 600 # 10min
  254. long_running: true
  255. script: python frequent_pausing/script.py
  256. variations:
  257. - __suffix__: aws
  258. - __suffix__: gce
  259. env: gce
  260. frequency: manual
  261. cluster:
  262. cluster_compute: frequent_pausing/compute_config_gce.yaml
  263. alert: default
  264. - name: long_running_horovod_tune_test
  265. group: AIR tests
  266. working_dir: air_tests
  267. frequency: weekly
  268. team: ml
  269. cluster:
  270. byod:
  271. type: gpu
  272. post_build_script: byod_horovod_master_test.sh
  273. cluster_compute: horovod/compute_tpl_aws.yaml
  274. variations:
  275. - __suffix__: aws
  276. - __suffix__: gce
  277. env: gce
  278. frequency: manual
  279. cluster:
  280. cluster_compute: horovod/compute_tpl_gce.yaml
  281. run:
  282. timeout: 36000
  283. script: python horovod/workloads/horovod_tune_test.py
  284. long_running: true
  285. wait_for_nodes:
  286. num_nodes: 2
  287. smoke_test:
  288. frequency: manual
  289. run:
  290. timeout: 3600
  291. alert: default
  292. # Ray AIR distributed Torch benchmarks
  293. - name: air_benchmark_torch_mnist_cpu_4x1
  294. group: AIR tests
  295. working_dir: air_tests/air_benchmarks
  296. frequency: nightly
  297. team: ml
  298. cluster:
  299. byod:
  300. type: gpu
  301. cluster_compute: compute_cpu_4_aws.yaml
  302. run:
  303. timeout: 3600
  304. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
  305. wait_for_nodes:
  306. num_nodes: 4
  307. variations:
  308. - __suffix__: aws
  309. - __suffix__: gce
  310. env: gce
  311. frequency: manual
  312. cluster:
  313. cluster_compute: compute_cpu_4_gce.yaml
  314. alert: default
  315. - name: air_benchmark_torch_mnist_gpu_4x4
  316. group: AIR tests
  317. working_dir: air_tests/air_benchmarks
  318. frequency: weekly
  319. team: ml
  320. cluster:
  321. byod:
  322. type: gpu
  323. cluster_compute: compute_gpu_4x4_aws.yaml
  324. run:
  325. timeout: 4800
  326. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 120 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
  327. wait_for_nodes:
  328. num_nodes: 4
  329. smoke_test:
  330. frequency: nightly
  331. cluster:
  332. cluster_compute: compute_gpu_2x2_aws.yaml
  333. run:
  334. timeout: 3600
  335. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
  336. wait_for_nodes:
  337. num_nodes: 2
  338. variations:
  339. - __suffix__: aws
  340. - __suffix__: gce
  341. env: gce
  342. frequency: manual
  343. cluster:
  344. cluster_compute: compute_gpu_4x4_gce.yaml
  345. smoke_test:
  346. frequency: manual
  347. alert: default
  348. - name: air_benchmark_torch_mnist_cpu_1x4
  349. group: AIR tests
  350. working_dir: air_tests/air_benchmarks
  351. frequency: nightly
  352. team: ml
  353. cluster:
  354. byod:
  355. type: gpu
  356. cluster_compute: compute_cpu_1_aws.yaml
  357. run:
  358. timeout: 3600
  359. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
  360. variations:
  361. - __suffix__: aws
  362. - __suffix__: gce
  363. env: gce
  364. frequency: manual
  365. cluster:
  366. cluster_compute: compute_cpu_1_gce.yaml
  367. alert: default
  368. - name: air_benchmark_torch_mnist_cpu_4x4
  369. group: AIR tests
  370. working_dir: air_tests/air_benchmarks
  371. frequency: nightly
  372. team: ml
  373. cluster:
  374. byod:
  375. type: gpu
  376. cluster_compute: compute_cpu_4_aws.yaml
  377. run:
  378. timeout: 5400
  379. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
  380. wait_for_nodes:
  381. num_nodes: 4
  382. variations:
  383. - __suffix__: aws
  384. - __suffix__: gce
  385. env: gce
  386. frequency: manual
  387. cluster:
  388. cluster_compute: compute_cpu_4_gce.yaml
  389. alert: default
  390. - name: air_benchmark_tune_torch_mnist
  391. group: AIR tests
  392. working_dir: air_tests/air_benchmarks
  393. frequency: nightly
  394. team: ml
  395. cluster:
  396. byod:
  397. type: gpu
  398. cluster_compute: compute_cpu_8_aws.yaml
  399. run:
  400. timeout: 3600
  401. script: python workloads/tune_torch_benchmark.py --num-runs 3 --num-trials 8 --num-workers 4
  402. wait_for_nodes:
  403. num_nodes: 8
  404. variations:
  405. - __suffix__: aws
  406. - __suffix__: gce
  407. env: gce
  408. frequency: manual
  409. cluster:
  410. cluster_compute: compute_cpu_8_gce.yaml
  411. alert: default
  412. - name: air_benchmark_tune_torch_mnist_gpu
  413. group: AIR tests
  414. working_dir: air_tests/air_benchmarks
  415. frequency: nightly
  416. team: ml
  417. cluster:
  418. byod:
  419. type: gpu
  420. cluster_compute: compute_gpu_4x4_aws.yaml
  421. run:
  422. timeout: 3600
  423. script: python workloads/tune_torch_benchmark.py --num-runs 2 --num-trials 4 --num-workers 4 --use-gpu
  424. wait_for_nodes:
  425. num_nodes: 4
  426. variations:
  427. - __suffix__: aws
  428. - __suffix__: gce
  429. env: gce
  430. frequency: manual
  431. cluster:
  432. cluster_compute: compute_gpu_4x4_gce.yaml
  433. alert: default
  434. # Ray AIR distributed Tensorflow benchmarks
  435. - name: air_benchmark_tensorflow_mnist_cpu_4x1
  436. group: AIR tests
  437. working_dir: air_tests/air_benchmarks
  438. frequency: nightly
  439. team: ml
  440. cluster:
  441. byod:
  442. type: gpu
  443. cluster_compute: compute_cpu_4_aws.yaml
  444. run:
  445. timeout: 5400
  446. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
  447. wait_for_nodes:
  448. num_nodes: 4
  449. variations:
  450. - __suffix__: aws
  451. - __suffix__: gce
  452. env: gce
  453. frequency: manual
  454. cluster:
  455. cluster_compute: compute_cpu_4_gce.yaml
  456. alert: default
  457. - name: air_benchmark_tensorflow_mnist_cpu_1x4
  458. group: AIR tests
  459. working_dir: air_tests/air_benchmarks
  460. frequency: nightly
  461. team: ml
  462. cluster:
  463. byod:
  464. type: gpu
  465. cluster_compute: compute_cpu_1_aws.yaml
  466. run:
  467. timeout: 5400
  468. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
  469. variations:
  470. - __suffix__: aws
  471. - __suffix__: gce
  472. env: gce
  473. frequency: manual
  474. cluster:
  475. cluster_compute: compute_cpu_1_gce.yaml
  476. alert: default
  477. - name: air_benchmark_tensorflow_mnist_cpu_4x4
  478. group: AIR tests
  479. working_dir: air_tests/air_benchmarks
  480. frequency: nightly
  481. team: ml
  482. stable: false
  483. cluster:
  484. byod:
  485. type: gpu
  486. cluster_compute: compute_cpu_4_aws.yaml
  487. run:
  488. timeout: 5400
  489. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
  490. wait_for_nodes:
  491. num_nodes: 4
  492. variations:
  493. - __suffix__: aws
  494. - __suffix__: gce
  495. env: gce
  496. frequency: manual
  497. cluster:
  498. cluster_compute: compute_cpu_4_gce.yaml
  499. alert: default
  500. - name: air_benchmark_tensorflow_mnist_gpu_4x4
  501. group: AIR tests
  502. working_dir: air_tests/air_benchmarks
  503. frequency: weekly
  504. team: ml
  505. stable: false
  506. cluster:
  507. byod:
  508. type: gpu
  509. cluster_compute: compute_gpu_4x4_aws.yaml
  510. run:
  511. timeout: 5400
  512. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
  513. wait_for_nodes:
  514. num_nodes: 4
  515. smoke_test:
  516. frequency: nightly
  517. cluster:
  518. cluster_compute: compute_gpu_2x2_aws.yaml
  519. run:
  520. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
  521. wait_for_nodes:
  522. num_nodes: 2
  523. variations:
  524. - __suffix__: aws
  525. - __suffix__: gce
  526. env: gce
  527. frequency: manual
  528. cluster:
  529. cluster_compute: compute_gpu_4x4_gce.yaml
  530. smoke_test:
  531. frequency: manual
  532. alert: default
  533. - name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb
  534. group: AIR tests
  535. working_dir: air_tests/air_benchmarks
  536. frequency: nightly
  537. team: ml
  538. cluster:
  539. byod:
  540. type: gpu
  541. cluster_compute: compute_gpu_1_aws.yaml
  542. run:
  543. timeout: 3600
  544. script: python workloads/pytorch_training_e2e.py --data-size-gb 20
  545. alert: default
  546. variations:
  547. - __suffix__: aws
  548. - __suffix__: gce
  549. env: gce
  550. frequency: manual
  551. cluster:
  552. cluster_compute: compute_gpu_1_gce.yaml
  553. - name: air_benchmark_pytorch_training_e2e_gpu_4x4_100gb
  554. group: AIR tests
  555. working_dir: air_tests/air_benchmarks
  556. frequency: nightly
  557. team: ml
  558. stable: false
  559. cluster:
  560. byod:
  561. type: gpu
  562. cluster_compute: compute_gpu_4x4_aws.yaml
  563. run:
  564. timeout: 10800
  565. script: python workloads/pytorch_training_e2e.py --data-size-gb=100 --num-workers=16
  566. wait_for_nodes:
  567. num_nodes: 4
  568. alert: default
  569. variations:
  570. - __suffix__: aws
  571. - __suffix__: gce
  572. env: gce
  573. frequency: manual
  574. cluster:
  575. cluster_compute: compute_gpu_4x4_gce.yaml
  576. # Test tiny and medium input files to check that performance stays roughly
  577. # constant.
  578. - name: ray-data-resnet50-ingest-file-size-benchmark
  579. group: AIR tests
  580. working_dir: air_tests/air_benchmarks/mlperf-train
  581. frequency: nightly
  582. team: data
  583. cluster:
  584. byod:
  585. type: gpu
  586. runtime_env:
  587. - RAY_task_oom_retries=50
  588. - RAY_min_memory_free_bytes=1000000000
  589. cluster_compute: compute_cpu_16.yaml
  590. run:
  591. timeout: 3600
  592. script: bash file_size_benchmark.sh
  593. variations:
  594. - __suffix__: aws
  595. - __suffix__: gce
  596. env: gce
  597. frequency: manual
  598. cluster:
  599. cluster_compute: compute_gce_cpu_16.yaml
  600. # Test huge files to check that we do not OOM.
  601. - name: ray-data-resnet50-ingest-out-of-memory-benchmark
  602. group: AIR tests
  603. working_dir: air_tests/air_benchmarks/mlperf-train
  604. stable: false
  605. frequency: nightly
  606. team: data
  607. cluster:
  608. byod:
  609. type: gpu
  610. runtime_env:
  611. - RAY_task_oom_retries=50
  612. - RAY_min_memory_free_bytes=1000000000
  613. cluster_compute: compute_cpu_16.yaml
  614. run:
  615. timeout: 3600
  616. script: bash oom_benchmark.sh
  617. variations:
  618. - __suffix__: aws
  619. - __suffix__: gce
  620. env: gce
  621. frequency: manual
  622. cluster:
  623. cluster_compute: compute_gce_cpu_16.yaml
  624. #######################
  625. # AIR examples
  626. #######################
  627. # Test additional CPU nodes for preprocessing.
  628. - name: air_example_dreambooth_finetuning
  629. group: AIR examples
  630. working_dir: air_examples/dreambooth
  631. stable: false
  632. frequency: weekly
  633. team: ml
  634. cluster:
  635. byod:
  636. type: gpu
  637. cluster_compute: dreambooth_compute_aws.yaml
  638. run:
  639. timeout: 1800
  640. script: pip install -Ur dreambooth/requirements.txt && bash dreambooth_run.sh
  641. artifact_path: /tmp/artifacts/example_out.jpg
  642. # variations: A10G is not available on GCE yet.
  643. - name: air_example_dreambooth_finetuning_lora
  644. group: AIR examples
  645. working_dir: air_examples/dreambooth
  646. stable: false
  647. frequency: weekly
  648. team: ml
  649. cluster:
  650. byod:
  651. type: gpu
  652. cluster_compute: dreambooth_compute_aws.yaml
  653. run:
  654. timeout: 1800
  655. script: pip install -Ur dreambooth/requirements.txt && bash dreambooth_run.sh --lora
  656. artifact_path: /tmp/artifacts/example_out.jpg
  657. - name: air_example_gptj_deepspeed_fine_tuning
  658. group: AIR examples
  659. working_dir: air_examples/gptj_deepspeed_finetuning
  660. frequency: weekly
  661. team: ml
  662. cluster:
  663. byod:
  664. type: gpu
  665. pip:
  666. - myst-parser==0.15.2
  667. - myst-nb==0.13.1
  668. - jupytext==1.13.6
  669. cluster_compute: gptj_deepspeed_compute_aws.yaml
  670. run:
  671. timeout: 4500
  672. script: python test_myst_doc.py --path gptj_deepspeed_fine_tuning.ipynb
  673. - name: air_example_dolly_v2_lightning_fsdp_finetuning
  674. group: AIR examples
  675. working_dir: air_examples/dolly_v2_lightning_fsdp_finetuning
  676. frequency: weekly
  677. team: ml
  678. cluster:
  679. byod:
  680. type: gpu
  681. pip:
  682. - "datasets"
  683. - "evaluate"
  684. - "scikit-learn"
  685. - "boto3"
  686. - myst-parser==0.15.2
  687. - myst-nb==0.13.1
  688. - jupytext==1.13.6
  689. post_build_script: byod_dolly_test.sh
  690. cluster_compute: dolly_v2_fsdp_compute_aws.yaml
  691. run:
  692. timeout: 4700
  693. script: python test_myst_doc.py --path lightning-llm-finetuning-7b.ipynb
  694. # variations: TODO(jungong): add GCP variation.
  695. - name: air_example_vicuna_13b_lightning_deepspeed_finetuning
  696. group: AIR examples
  697. working_dir: air_examples/vicuna_13b_lightning_deepspeed_finetuning
  698. frequency: weekly
  699. team: ml
  700. cluster:
  701. byod:
  702. type: gpu
  703. pip:
  704. - myst-parser==0.15.2
  705. - myst-nb==0.13.1
  706. post_build_script: byod_vicuna_test.sh
  707. cluster_compute: vicuna_13b_deepspeed_compute_aws.yaml
  708. run:
  709. timeout: 4700
  710. script: python test_myst_doc.py --path vicuna_13b_lightning_deepspeed_finetune.ipynb
  711. #####################################
  712. # Workspace templates release tests #
  713. #####################################
  714. - name: workspace_template_batch_inference
  715. group: Workspace templates
  716. working_dir: workspace_templates/01_batch_inference
  717. frequency: nightly-3x
  718. team: data
  719. cluster:
  720. byod:
  721. type: gpu
  722. cluster_compute: ../testing/compute_configs/gpu/aws.yaml
  723. run:
  724. timeout: 600
  725. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py
  726. variations:
  727. - __suffix__: aws
  728. - __suffix__: gce
  729. env: gce
  730. frequency: manual
  731. cluster:
  732. cluster_compute: ../testing/compute_configs/gpu/gce.yaml
  733. - name: workspace_template_many_model_training
  734. group: Workspace templates
  735. working_dir: workspace_templates/02_many_model_training
  736. frequency: nightly-3x
  737. team: ml
  738. cluster:
  739. byod:
  740. type: gpu
  741. cluster_compute: ../testing/compute_configs/cpu/aws.yaml
  742. run:
  743. timeout: 600
  744. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py
  745. variations:
  746. - __suffix__: aws
  747. - __suffix__: gce
  748. env: gce
  749. frequency: manual
  750. cluster:
  751. cluster_compute: ../testing/compute_configs/cpu/gce.yaml
  752. - name: workspace_template_serving_stable_diffusion
  753. group: Workspace templates
  754. working_dir: workspace_templates/03_serving_stable_diffusion
  755. frequency: nightly-3x
  756. team: serve
  757. cluster:
  758. byod:
  759. type: gpu
  760. post_build_script: byod_stable_diffusion.sh
  761. cluster_compute: ../testing/compute_configs/gpu/aws.yaml
  762. run:
  763. timeout: 600
  764. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py && serve run app:entrypoint --non-blocking && python query.py
  765. variations:
  766. - __suffix__: aws
  767. - __suffix__: gce
  768. env: gce
  769. frequency: manual
  770. cluster:
  771. cluster_compute: ../testing/compute_configs/gpu/gce.yaml
  772. - name: workspace_template_finetuning_llms_with_deepspeed_llama_2_7b
  773. group: Workspace templates
  774. working_dir: workspace_templates/04_finetuning_llms_with_deepspeed
  775. frequency: nightly-3x
  776. team: ml
  777. cluster:
  778. byod:
  779. type: cu123
  780. # This needs to be in sync with requirements under go/llm-forge.
  781. post_build_script: byod_finetune_llvms.sh
  782. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml
  783. run:
  784. timeout: 1000
  785. script: chmod +x ./run_llama_ft.sh && ./run_llama_ft.sh --size=7b --as-test
  786. variations:
  787. - __suffix__: aws
  788. - __suffix__: gce
  789. env: gce
  790. frequency: manual
  791. cluster:
  792. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/gce_7b.yaml
  793. - name: workspace_template_finetuning_llms_with_deepspeed_llama_2_7b_lora
  794. group: Workspace templates
  795. working_dir: workspace_templates/04_finetuning_llms_with_deepspeed
  796. frequency: nightly-3x
  797. team: ml
  798. cluster:
  799. byod:
  800. type: cu123
  801. # This needs to be in sync with requirements under go/llm-forge.
  802. post_build_script: byod_finetune_llvms.sh
  803. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml
  804. run:
  805. timeout: 1000
  806. script: chmod +x ./run_llama_ft.sh && ./run_llama_ft.sh --size=7b --lora --as-test
  807. variations:
  808. - __suffix__: aws
  809. - __suffix__: gce
  810. env: gce
  811. frequency: manual
  812. cluster:
  813. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/gce_7b.yaml
  814. #######################
  815. # ML user tests
  816. #######################
  817. - name: ml_user_horovod_user_test_latest
  818. group: ML user tests
  819. working_dir: ml_user_tests
  820. frequency: nightly-3x
  821. team: ml
  822. cluster:
  823. byod:
  824. type: gpu
  825. post_build_script: byod_horovod_test.sh
  826. cluster_compute: horovod/compute_tpl_aws.yaml
  827. run:
  828. timeout: 1200
  829. script: python horovod/horovod_user_test.py
  830. wait_for_nodes:
  831. num_nodes: 4
  832. variations:
  833. - __suffix__: aws
  834. - __suffix__: gce
  835. env: gce
  836. frequency: manual
  837. cluster:
  838. cluster_compute: horovod/compute_tpl_gce.yaml
  839. alert: default
  840. - name: ml_user_horovod_user_test_master
  841. group: ML user tests
  842. working_dir: ml_user_tests
  843. frequency: nightly-3x
  844. team: ml
  845. cluster:
  846. byod:
  847. type: gpu
  848. post_build_script: byod_horovod_master_test.sh
  849. cluster_compute: horovod/compute_tpl_aws.yaml
  850. run:
  851. timeout: 1200
  852. script: python horovod/horovod_user_test.py
  853. wait_for_nodes:
  854. num_nodes: 4
  855. variations:
  856. - __suffix__: aws
  857. - __suffix__: gce
  858. env: gce
  859. frequency: manual
  860. cluster:
  861. cluster_compute: horovod/compute_tpl_gce.yaml
  862. alert: default
  863. - name: ml_user_train_tensorflow_mnist_test
  864. group: ML user tests
  865. working_dir: ml_user_tests
  866. frequency: nightly-3x
  867. team: ml
  868. cluster:
  869. byod:
  870. runtime_env:
  871. - TRAIN_PLACEMENT_GROUP_TIMEOUT_S=2000
  872. type: gpu
  873. cluster_compute: train/compute_tpl_aws.yaml
  874. run:
  875. timeout: 36000
  876. script: python train/train_tensorflow_mnist_test.py
  877. wait_for_nodes:
  878. num_nodes: 3
  879. variations:
  880. - __suffix__: aws
  881. - __suffix__: gce
  882. env: gce
  883. frequency: manual
  884. cluster:
  885. cluster_compute: train/compute_tpl_gce.yaml
  886. alert: default
  887. - name: ml_user_train_torch_linear_test
  888. group: ML user tests
  889. working_dir: ml_user_tests
  890. frequency: nightly-3x
  891. team: ml
  892. cluster:
  893. byod:
  894. runtime_env:
  895. - TRAIN_PLACEMENT_GROUP_TIMEOUT_S=2000
  896. type: gpu
  897. cluster_compute: train/compute_tpl_aws.yaml
  898. run:
  899. timeout: 36000
  900. script: python train/train_torch_linear_test.py
  901. wait_for_nodes:
  902. num_nodes: 3
  903. variations:
  904. - __suffix__: aws
  905. - __suffix__: gce
  906. env: gce
  907. frequency: manual
  908. cluster:
  909. cluster_compute: train/compute_tpl_gce.yaml
  910. alert: default
  911. - name: ml_user_tune_rllib_connect_test
  912. group: ML user tests
  913. working_dir: ml_user_tests
  914. frequency: nightly-3x
  915. team: ml
  916. cluster:
  917. byod:
  918. type: gpu
  919. post_build_script: byod_rllib_test.sh
  920. runtime_env:
  921. - RLLIB_TEST_NO_JAX_IMPORT=1
  922. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  923. cluster_compute: tune_rllib/compute_tpl_aws.yaml
  924. run:
  925. timeout: 2000
  926. script: python tune_rllib/run_connect_tests.py
  927. wait_for_nodes:
  928. num_nodes: 9
  929. variations:
  930. - __suffix__: aws
  931. - __suffix__: gce
  932. env: gce
  933. frequency: manual
  934. cluster:
  935. cluster_compute: tune_rllib/compute_tpl_gce.yaml
  936. alert: default
  937. #######################
  938. # Tune cloud tests
  939. #######################
  940. - name: tune_cloud_long_running_cloud_storage
  941. group: Tune cloud tests
  942. working_dir: tune_tests/cloud_tests
  943. frequency: weekly
  944. team: ml
  945. cluster:
  946. byod: {}
  947. cluster_compute: tpl_aws_1x4.yaml
  948. run:
  949. # 14 hours
  950. timeout: 50400
  951. long_running: true
  952. script: python workloads/long_running_cloud_storage.py s3://tune-cloud-tests/long_running_cloud_storage
  953. # NOTE: This smoke test is not useful to run because the point of the test
  954. # is to be long running. It is only meant for quickly debugging updates to the test.
  955. smoke_test:
  956. frequency: manual
  957. run:
  958. timeout: 600
  959. variations:
  960. - __suffix__: aws
  961. - __suffix__: gce
  962. env: gce
  963. frequency: manual
  964. cluster:
  965. cluster_compute: tpl_gce_1x4.yaml
  966. run:
  967. # 14 hours
  968. timeout: 50400
  969. long_running: true
  970. script: python workloads/long_running_cloud_storage.py gs://tune-cloud-tests/long_running_cloud_storage
  971. wait_for_nodes:
  972. num_nodes: 1
  973. alert: long_running_tests
  974. ########################
  975. # Tune scalability tests
  976. ########################
  977. - name: tune_scalability_bookkeeping_overhead
  978. group: Tune scalability tests
  979. working_dir: tune_tests/scalability_tests
  980. frequency: nightly
  981. team: ml
  982. cluster:
  983. byod: {}
  984. cluster_compute: tpl_1x16.yaml
  985. run:
  986. timeout: 1200
  987. script: python workloads/test_bookkeeping_overhead.py
  988. alert: tune_tests
  989. variations:
  990. - __suffix__: aws
  991. - __suffix__: gce
  992. env: gce
  993. frequency: manual
  994. cluster:
  995. cluster_compute: tpl_gce_1x16.yaml
  996. - name: tune_scalability_durable_trainable
  997. group: Tune scalability tests
  998. working_dir: tune_tests/scalability_tests
  999. frequency: nightly
  1000. team: ml
  1001. cluster:
  1002. byod: {}
  1003. cluster_compute: tpl_16x2.yaml
  1004. run:
  1005. timeout: 900
  1006. script: python workloads/test_durable_trainable.py --bucket s3://tune-cloud-tests/scalability_durable_trainable
  1007. wait_for_nodes:
  1008. num_nodes: 16
  1009. variations:
  1010. - __suffix__: aws
  1011. - __suffix__: gce
  1012. env: gce
  1013. frequency: manual
  1014. run:
  1015. timeout: 900
  1016. script: python workloads/test_durable_trainable.py --bucket gs://tune-cloud-tests/scalability_durable_trainable
  1017. wait_for_nodes:
  1018. num_nodes: 16
  1019. cluster:
  1020. cluster_compute: tpl_gce_16x2.yaml
  1021. alert: tune_tests
  1022. - name: tune_scalability_durable_multifile_checkpoints
  1023. group: Tune scalability tests
  1024. working_dir: tune_tests/scalability_tests
  1025. frequency: nightly
  1026. team: ml
  1027. cluster:
  1028. byod: {}
  1029. cluster_compute: tpl_16x2.yaml
  1030. run:
  1031. timeout: 900
  1032. script: python workloads/test_durable_multifile_checkpoints.py --bucket s3://tune-cloud-tests/scalability_durable_multifile_checkpoints
  1033. wait_for_nodes:
  1034. num_nodes: 16
  1035. variations:
  1036. - __suffix__: aws
  1037. - __suffix__: gce
  1038. env: gce
  1039. frequency: manual
  1040. run:
  1041. timeout: 900
  1042. script: python workloads/test_durable_multifile_checkpoints.py --bucket gs://tune-cloud-tests/scalability_durable_multifile_checkpoints
  1043. wait_for_nodes:
  1044. num_nodes: 16
  1045. cluster:
  1046. cluster_compute: tpl_gce_16x2.yaml
  1047. alert: tune_tests
  1048. - name: tune_scalability_long_running_large_checkpoints
  1049. group: Tune scalability tests
  1050. working_dir: tune_tests/scalability_tests
  1051. frequency: weekly
  1052. team: ml
  1053. cluster:
  1054. byod: {}
  1055. cluster_compute: tpl_1x32_hd.yaml
  1056. run:
  1057. timeout: 86400
  1058. script: python workloads/test_long_running_large_checkpoints.py
  1059. long_running: true
  1060. smoke_test:
  1061. frequency: nightly
  1062. run:
  1063. timeout: 3600
  1064. alert: tune_tests
  1065. variations:
  1066. - __suffix__: aws
  1067. - __suffix__: gce
  1068. env: gce
  1069. frequency: manual
  1070. smoke_test:
  1071. frequency: manual
  1072. cluster:
  1073. cluster_compute: tpl_gce_1x32_hd.yaml
  1074. - name: tune_scalability_network_overhead
  1075. group: Tune scalability tests
  1076. working_dir: tune_tests/scalability_tests
  1077. frequency: weekly
  1078. team: ml
  1079. cluster:
  1080. byod: {}
  1081. cluster_compute: tpl_100x2.yaml
  1082. run:
  1083. timeout: 750
  1084. prepare_timeout: 1200
  1085. script: python workloads/test_network_overhead.py
  1086. wait_for_nodes:
  1087. num_nodes: 100
  1088. alert: tune_tests
  1089. variations:
  1090. - __suffix__: aws
  1091. - __suffix__: smoke-test
  1092. frequency: nightly
  1093. cluster:
  1094. cluster_compute: tpl_20x2.yaml
  1095. run:
  1096. timeout: 750
  1097. prepare_timeout: 600
  1098. script: python workloads/test_network_overhead.py --smoke-test
  1099. wait_for_nodes:
  1100. num_nodes: 20
  1101. - __suffix__: gce
  1102. env: gce
  1103. frequency: manual
  1104. cluster:
  1105. cluster_compute: tpl_gce_100x2.yaml
  1106. - name: tune_scalability_result_throughput_cluster
  1107. group: Tune scalability tests
  1108. working_dir: tune_tests/scalability_tests
  1109. frequency: nightly-3x
  1110. team: ml
  1111. cluster:
  1112. byod: {}
  1113. cluster_compute: tpl_16x64.yaml
  1114. run:
  1115. timeout: 600
  1116. script: python workloads/test_result_throughput_cluster.py
  1117. wait_for_nodes:
  1118. num_nodes: 16
  1119. alert: tune_tests
  1120. variations:
  1121. - __suffix__: aws
  1122. - __suffix__: gce
  1123. env: gce
  1124. frequency: manual
  1125. cluster:
  1126. cluster_compute: tpl_gce_16x64.yaml
  1127. - name: tune_scalability_result_throughput_single_node
  1128. group: Tune scalability tests
  1129. working_dir: tune_tests/scalability_tests
  1130. frequency: nightly
  1131. team: ml
  1132. cluster:
  1133. byod: {}
  1134. cluster_compute: tpl_1x96.yaml
  1135. run:
  1136. timeout: 600
  1137. script: python workloads/test_result_throughput_single_node.py
  1138. alert: tune_tests
  1139. variations:
  1140. - __suffix__: aws
  1141. - __suffix__: gce
  1142. env: gce
  1143. frequency: manual
  1144. cluster:
  1145. cluster_compute: tpl_gce_1x96.yaml
  1146. ############################
  1147. # Tune fault tolerance tests
  1148. ############################
  1149. - name: tune_worker_fault_tolerance
  1150. group: Tune fault tolerance tests
  1151. working_dir: tune_tests/fault_tolerance_tests
  1152. stable: true
  1153. frequency: nightly-3x
  1154. team: ml
  1155. cluster:
  1156. byod: {}
  1157. cluster_compute: tpl_aws_16x1.yaml
  1158. run:
  1159. timeout: 5400
  1160. script: python workloads/test_tune_worker_fault_tolerance.py --bucket s3://tune-cloud-tests/worker_fault_tolerance
  1161. wait_for_nodes:
  1162. num_nodes: 16
  1163. # Disabled until we can kill nodes in GCE
  1164. # variations:
  1165. # - __suffix__: aws
  1166. # - __suffix__: gce
  1167. # env: gce
  1168. # frequency: manual
  1169. # run:
  1170. # timeout: 5400
  1171. # script: python workloads/test_tune_worker_fault_tolerance.py --bucket gs://tune-cloud-tests/worker_fault_tolerance
  1172. #
  1173. # wait_for_nodes:
  1174. # num_nodes: 16
  1175. # cluster:
  1176. # cluster_compute: tpl_gce_16x1.yaml
  1177. ########################
  1178. # Golden Notebook tests
  1179. ########################
  1180. - name: golden_notebook_torch_tune_serve_test
  1181. group: Golden Notebook tests
  1182. working_dir: golden_notebook_tests
  1183. frequency: nightly-3x
  1184. team: ml
  1185. cluster:
  1186. byod:
  1187. type: gpu
  1188. cluster_compute: gpu_tpl_aws.yaml
  1189. run:
  1190. timeout: 600
  1191. script: python workloads/torch_tune_serve_test.py
  1192. wait_for_nodes:
  1193. num_nodes: 2
  1194. variations:
  1195. - __suffix__: aws
  1196. - __suffix__: gce
  1197. env: gce
  1198. frequency: manual
  1199. cluster:
  1200. cluster_compute: gpu_tpl_gce.yaml
  1201. alert: default
  1202. #######################
  1203. # Long running tests
  1204. #######################
  1205. - name: long_running_actor_deaths
  1206. group: Long running tests
  1207. working_dir: long_running_tests
  1208. frequency: weekly
  1209. team: core
  1210. cluster:
  1211. byod:
  1212. pip:
  1213. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1214. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1215. runtime_env:
  1216. - RLLIB_TEST_NO_JAX_IMPORT=1
  1217. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1218. cluster_compute: tpl_cpu_1.yaml
  1219. run:
  1220. timeout: 86400
  1221. script: python workloads/actor_deaths.py
  1222. long_running: true
  1223. smoke_test:
  1224. frequency: nightly
  1225. run:
  1226. timeout: 3600
  1227. alert: long_running_tests
  1228. variations:
  1229. - __suffix__: aws
  1230. - __suffix__: gce
  1231. env: gce
  1232. frequency: manual
  1233. smoke_test:
  1234. frequency: manual
  1235. cluster:
  1236. cluster_compute: tpl_cpu_1_gce.yaml
  1237. - name: long_running_apex
  1238. group: Long running tests
  1239. working_dir: long_running_tests
  1240. stable: false
  1241. frequency: weekly
  1242. team: rllib
  1243. cluster:
  1244. byod:
  1245. type: gpu
  1246. post_build_script: byod_rllib_test.sh
  1247. runtime_env:
  1248. - RLLIB_TEST_NO_JAX_IMPORT=1
  1249. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1250. cluster_compute: tpl_cpu_3.yaml
  1251. run:
  1252. timeout: 86400
  1253. script: python workloads/apex.py
  1254. long_running: true
  1255. wait_for_nodes:
  1256. num_nodes: 3
  1257. smoke_test:
  1258. frequency: nightly
  1259. run:
  1260. timeout: 3600
  1261. alert: long_running_tests
  1262. variations:
  1263. - __suffix__: aws
  1264. - __suffix__: gce
  1265. env: gce
  1266. frequency: manual
  1267. smoke_test:
  1268. frequency: manual
  1269. run:
  1270. timeout: 3600
  1271. cluster:
  1272. cluster_compute: tpl_cpu_3_gce.yaml
  1273. - name: long_running_impala
  1274. group: Long running tests
  1275. working_dir: long_running_tests
  1276. frequency: weekly
  1277. team: rllib
  1278. cluster:
  1279. byod:
  1280. type: gpu
  1281. post_build_script: byod_rllib_test.sh
  1282. runtime_env:
  1283. - RLLIB_TEST_NO_JAX_IMPORT=1
  1284. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1285. cluster_compute: tpl_cpu_1_large.yaml
  1286. run:
  1287. timeout: 86400
  1288. script: python workloads/impala.py
  1289. long_running: true
  1290. smoke_test:
  1291. frequency: nightly
  1292. run:
  1293. timeout: 3600
  1294. alert: long_running_tests
  1295. variations:
  1296. - __suffix__: aws
  1297. - __suffix__: gce
  1298. env: gce
  1299. frequency: manual
  1300. smoke_test:
  1301. frequency: manual
  1302. run:
  1303. timeout: 3600
  1304. cluster:
  1305. cluster_compute: tpl_cpu_1_large_gce.yaml
  1306. - name: long_running_many_actor_tasks
  1307. group: Long running tests
  1308. working_dir: long_running_tests
  1309. frequency: weekly
  1310. team: core
  1311. cluster:
  1312. byod:
  1313. pip:
  1314. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1315. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1316. runtime_env:
  1317. - RLLIB_TEST_NO_JAX_IMPORT=1
  1318. cluster_compute: tpl_cpu_1.yaml
  1319. run:
  1320. timeout: 86400
  1321. script: python workloads/many_actor_tasks.py
  1322. long_running: true
  1323. smoke_test:
  1324. frequency: nightly
  1325. run:
  1326. timeout: 3600
  1327. alert: long_running_tests
  1328. variations:
  1329. - __suffix__: aws
  1330. - __suffix__: gce
  1331. env: gce
  1332. frequency: manual
  1333. smoke_test:
  1334. frequency: manual
  1335. run:
  1336. timeout: 3600
  1337. cluster:
  1338. cluster_compute: tpl_cpu_1_gce.yaml
  1339. - name: long_running_many_drivers
  1340. group: Long running tests
  1341. working_dir: long_running_tests
  1342. frequency: weekly
  1343. team: core
  1344. cluster:
  1345. byod:
  1346. pip:
  1347. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1348. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1349. runtime_env:
  1350. - RLLIB_TEST_NO_JAX_IMPORT=1
  1351. cluster_compute: tpl_cpu_4.yaml
  1352. run:
  1353. timeout: 86400
  1354. script: python workloads/many_drivers.py --iteration-num=4000
  1355. long_running: true
  1356. wait_for_nodes:
  1357. num_nodes: 4
  1358. smoke_test:
  1359. frequency: nightly
  1360. run:
  1361. timeout: 3600
  1362. alert: long_running_tests
  1363. variations:
  1364. - __suffix__: aws
  1365. - __suffix__: gce
  1366. env: gce
  1367. frequency: manual
  1368. smoke_test:
  1369. frequency: manual
  1370. run:
  1371. timeout: 3600
  1372. cluster:
  1373. cluster_compute: tpl_cpu_4_gce.yaml
  1374. - name: long_running_many_ppo
  1375. group: Long running tests
  1376. working_dir: long_running_tests
  1377. stable: false
  1378. frequency: weekly
  1379. team: ml
  1380. cluster:
  1381. byod:
  1382. type: gpu
  1383. post_build_script: byod_rllib_test.sh
  1384. runtime_env:
  1385. - RLLIB_TEST_NO_JAX_IMPORT=1
  1386. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1387. cluster_compute: many_ppo.yaml
  1388. run:
  1389. timeout: 86400
  1390. script: python workloads/many_ppo.py
  1391. long_running: true
  1392. wait_for_nodes:
  1393. num_nodes: 1
  1394. smoke_test:
  1395. frequency: nightly
  1396. run:
  1397. timeout: 3600
  1398. alert: long_running_tests
  1399. variations:
  1400. - __suffix__: aws
  1401. - __suffix__: gce
  1402. env: gce
  1403. frequency: manual
  1404. smoke_test:
  1405. frequency: manual
  1406. run:
  1407. timeout: 3600
  1408. cluster:
  1409. cluster_compute: many_ppo_gce.yaml
  1410. - name: long_running_many_tasks
  1411. group: Long running tests
  1412. working_dir: long_running_tests
  1413. frequency: weekly
  1414. team: core
  1415. cluster:
  1416. byod:
  1417. pip:
  1418. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1419. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1420. runtime_env:
  1421. - RLLIB_TEST_NO_JAX_IMPORT=1
  1422. cluster_compute: tpl_cpu_1.yaml
  1423. run:
  1424. timeout: 86400
  1425. script: python workloads/many_tasks.py
  1426. long_running: true
  1427. smoke_test:
  1428. frequency: nightly
  1429. run:
  1430. timeout: 3600
  1431. alert: long_running_tests
  1432. variations:
  1433. - __suffix__: aws
  1434. - __suffix__: gce
  1435. env: gce
  1436. frequency: manual
  1437. smoke_test:
  1438. frequency: manual
  1439. run:
  1440. timeout: 3600
  1441. cluster:
  1442. cluster_compute: tpl_cpu_1_gce.yaml
  1443. - name: long_running_many_tasks_serialized_ids
  1444. group: Long running tests
  1445. working_dir: long_running_tests
  1446. frequency: weekly
  1447. team: core
  1448. cluster:
  1449. byod:
  1450. pip:
  1451. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1452. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1453. runtime_env:
  1454. - RLLIB_TEST_NO_JAX_IMPORT=1
  1455. cluster_compute: tpl_cpu_1.yaml
  1456. run:
  1457. timeout: 86400
  1458. script: python workloads/many_tasks_serialized_ids.py
  1459. long_running: true
  1460. smoke_test:
  1461. frequency: nightly
  1462. run:
  1463. timeout: 3600
  1464. alert: long_running_tests
  1465. variations:
  1466. - __suffix__: aws
  1467. - __suffix__: gce
  1468. env: gce
  1469. frequency: manual
  1470. smoke_test:
  1471. frequency: manual
  1472. run:
  1473. timeout: 3600
  1474. cluster:
  1475. cluster_compute: tpl_cpu_1_gce.yaml
  1476. - name: long_running_node_failures
  1477. group: Long running tests
  1478. working_dir: long_running_tests
  1479. frequency: weekly
  1480. team: core
  1481. cluster:
  1482. byod:
  1483. pip:
  1484. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1485. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1486. runtime_env:
  1487. - RLLIB_TEST_NO_JAX_IMPORT=1
  1488. cluster_compute: tpl_cpu_1.yaml
  1489. run:
  1490. timeout: 86400
  1491. script: python workloads/node_failures.py
  1492. long_running: true
  1493. smoke_test:
  1494. frequency: nightly
  1495. run:
  1496. timeout: 3600
  1497. alert: long_running_tests
  1498. variations:
  1499. - __suffix__: aws
  1500. - __suffix__: gce
  1501. env: gce
  1502. frequency: manual
  1503. smoke_test:
  1504. frequency: manual
  1505. run:
  1506. timeout: 3600
  1507. cluster:
  1508. cluster_compute: tpl_cpu_1_gce.yaml
  1509. - name: long_running_serve
  1510. group: Long running tests
  1511. working_dir: long_running_tests
  1512. frequency: weekly
  1513. team: serve
  1514. cluster:
  1515. byod:
  1516. pip:
  1517. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1518. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1519. runtime_env:
  1520. - RLLIB_TEST_NO_JAX_IMPORT=1
  1521. cluster_compute: tpl_cpu_1_large.yaml
  1522. run:
  1523. timeout: 86400
  1524. script: python workloads/serve.py
  1525. long_running: true
  1526. smoke_test:
  1527. frequency: nightly
  1528. run:
  1529. timeout: 3600
  1530. alert: long_running_tests
  1531. variations:
  1532. - __suffix__: aws
  1533. - __suffix__: gce
  1534. env: gce
  1535. frequency: manual
  1536. smoke_test:
  1537. frequency: manual
  1538. run:
  1539. timeout: 3600
  1540. cluster:
  1541. cluster_compute: tpl_cpu_1_gce.yaml
  1542. - name: long_running_serve_failure
  1543. group: Long running tests
  1544. working_dir: long_running_tests
  1545. stable: true
  1546. frequency: weekly
  1547. team: serve
  1548. cluster:
  1549. byod:
  1550. pip:
  1551. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1552. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1553. runtime_env:
  1554. - RLLIB_TEST_NO_JAX_IMPORT=1
  1555. cluster_compute: tpl_cpu_1_c5.yaml
  1556. run:
  1557. timeout: 86400
  1558. script: python workloads/serve_failure.py
  1559. long_running: true
  1560. smoke_test:
  1561. frequency: nightly
  1562. run:
  1563. timeout: 600
  1564. alert: long_running_tests
  1565. variations:
  1566. - __suffix__: aws
  1567. - __suffix__: gce
  1568. env: gce
  1569. frequency: manual
  1570. smoke_test:
  1571. frequency: manual
  1572. run:
  1573. timeout: 86400
  1574. cluster:
  1575. cluster_compute: tpl_cpu_1_c5_gce.yaml
  1576. - name: long_running_many_jobs
  1577. group: Long running tests
  1578. working_dir: long_running_tests
  1579. stable: true
  1580. frequency: weekly
  1581. team: serve
  1582. cluster:
  1583. byod:
  1584. pip:
  1585. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1586. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1587. runtime_env:
  1588. - RLLIB_TEST_NO_JAX_IMPORT=1
  1589. cluster_compute: tpl_cpu_1.yaml
  1590. run:
  1591. timeout: 86400
  1592. script: python workloads/long_running_many_jobs.py --num-clients=1
  1593. long_running: true
  1594. smoke_test:
  1595. frequency: nightly
  1596. run:
  1597. timeout: 1800
  1598. alert: long_running_tests
  1599. variations:
  1600. - __suffix__: aws
  1601. - __suffix__: gce
  1602. env: gce
  1603. frequency: manual
  1604. smoke_test:
  1605. frequency: manual
  1606. run:
  1607. timeout: 3600
  1608. cluster:
  1609. cluster_compute: tpl_cpu_1_gce.yaml
  1610. - name: long_running_distributed_pytorch_pbt_failure
  1611. group: Long running tests
  1612. working_dir: long_running_distributed_tests
  1613. frequency: weekly
  1614. team: ml
  1615. cluster:
  1616. byod:
  1617. type: gpu
  1618. cluster_compute: compute_tpl.yaml
  1619. run:
  1620. timeout: 86400
  1621. script: python workloads/pytorch_pbt_failure.py
  1622. long_running: true
  1623. smoke_test:
  1624. frequency: manual
  1625. run:
  1626. timeout: 3600
  1627. alert: long_running_tests
  1628. variations:
  1629. - __suffix__: aws
  1630. - __suffix__: gce
  1631. env: gce
  1632. frequency: manual
  1633. smoke_test:
  1634. frequency: manual
  1635. run:
  1636. timeout: 3600
  1637. cluster:
  1638. cluster_compute: compute_tpl_gce.yaml
  1639. ########################
  1640. # Jobs tests
  1641. ########################
  1642. - name: jobs_basic_local_working_dir
  1643. group: Jobs tests
  1644. working_dir: jobs_tests
  1645. frequency: nightly
  1646. team: serve
  1647. cluster:
  1648. byod:
  1649. type: gpu
  1650. cluster_compute: compute_tpl_4_xlarge.yaml
  1651. run:
  1652. timeout: 600
  1653. script: python workloads/jobs_basic.py --working-dir "workloads"
  1654. wait_for_nodes:
  1655. num_nodes: 4
  1656. alert: default
  1657. variations:
  1658. - __suffix__: aws
  1659. - __suffix__: gce
  1660. env: gce
  1661. frequency: manual
  1662. cluster:
  1663. cluster_compute: compute_tpl_gce_4_xlarge.yaml
  1664. - name: jobs_basic_remote_working_dir
  1665. group: Jobs tests
  1666. working_dir: jobs_tests
  1667. frequency: nightly
  1668. team: serve
  1669. cluster:
  1670. byod:
  1671. type: gpu
  1672. cluster_compute: compute_tpl_4_xlarge.yaml
  1673. run:
  1674. timeout: 600
  1675. script: python workloads/jobs_basic.py --working-dir "https://github.com/anyscale/job-services-cuj-examples/archive/refs/heads/main.zip"
  1676. wait_for_nodes:
  1677. num_nodes: 4
  1678. alert: default
  1679. variations:
  1680. - __suffix__: aws
  1681. - __suffix__: gce
  1682. env: gce
  1683. frequency: manual
  1684. cluster:
  1685. cluster_compute: compute_tpl_gce_4_xlarge.yaml
  1686. - name: jobs_remote_multi_node
  1687. group: Jobs tests
  1688. team: serve
  1689. frequency: nightly
  1690. working_dir: jobs_tests
  1691. cluster:
  1692. byod:
  1693. type: gpu
  1694. cluster_compute: compute_tpl_4_xlarge.yaml
  1695. run:
  1696. timeout: 600
  1697. script: python workloads/jobs_remote_multi_node.py
  1698. wait_for_nodes:
  1699. num_nodes: 4
  1700. variations:
  1701. - __suffix__: aws
  1702. - __suffix__: gce
  1703. env: gce
  1704. frequency: manual
  1705. cluster:
  1706. cluster_compute: compute_tpl_gce_4_xlarge.yaml
  1707. - name: jobs_check_cuda_available
  1708. group: Jobs tests
  1709. team: serve
  1710. frequency: nightly
  1711. working_dir: jobs_tests
  1712. cluster:
  1713. byod:
  1714. type: gpu
  1715. cluster_compute: compute_tpl_gpu_node.yaml
  1716. run:
  1717. timeout: 600
  1718. script: python workloads/jobs_check_cuda_available.py
  1719. wait_for_nodes:
  1720. num_nodes: 2
  1721. variations:
  1722. - __suffix__: aws
  1723. - __suffix__: gce
  1724. env: gce
  1725. frequency: manual
  1726. cluster:
  1727. cluster_compute: compute_tpl_gce_gpu_node.yaml
  1728. - name: jobs_specify_num_gpus
  1729. group: Jobs tests
  1730. team: serve
  1731. frequency: nightly
  1732. working_dir: jobs_tests
  1733. cluster:
  1734. byod:
  1735. type: gpu
  1736. cluster_compute: compute_tpl_gpu_worker.yaml
  1737. run:
  1738. timeout: 600
  1739. script: python workloads/jobs_specify_num_gpus.py --working-dir "workloads"
  1740. wait_for_nodes:
  1741. num_nodes: 2
  1742. variations:
  1743. - __suffix__: aws
  1744. - __suffix__: gce
  1745. env: gce
  1746. frequency: manual
  1747. cluster:
  1748. cluster_compute: compute_tpl_gce_gpu_worker.yaml
  1749. ########################
  1750. # Runtime env tests
  1751. ########################
  1752. - name: runtime_env_rte_many_tasks_actors
  1753. group: Runtime env tests
  1754. working_dir: runtime_env_tests
  1755. frequency: nightly
  1756. team: core
  1757. cluster:
  1758. byod: {}
  1759. cluster_compute: rte_small.yaml
  1760. run:
  1761. timeout: 600
  1762. script: python workloads/rte_many_tasks_actors.py
  1763. wait_for_nodes:
  1764. num_nodes: 4
  1765. alert: default
  1766. variations:
  1767. - __suffix__: aws
  1768. - __suffix__: gce
  1769. env: gce
  1770. frequency: manual
  1771. cluster:
  1772. cluster_compute: rte_gce_small.yaml
  1773. - name: runtime_env_wheel_urls
  1774. group: Runtime env tests
  1775. working_dir: runtime_env_tests
  1776. frequency: nightly
  1777. team: core
  1778. cluster:
  1779. byod: {}
  1780. cluster_compute: rte_minimal.yaml
  1781. run:
  1782. timeout: 9000
  1783. script: python workloads/wheel_urls.py
  1784. wait_for_nodes:
  1785. num_nodes: 1
  1786. alert: default
  1787. variations:
  1788. - __suffix__: aws
  1789. - __suffix__: gce
  1790. env: gce
  1791. frequency: manual
  1792. cluster:
  1793. cluster_compute: rte_gce_minimal.yaml
  1794. # Disabled (kept commented out below): the consensus is that this should be tested in CI rather than in a nightly test.
  1795. # - name: runtime_env_rte_ray_client
  1796. # group: Runtime env tests
  1797. # working_dir: runtime_env_tests
  1798. # frequency: nightly
  1799. # team: core
  1800. # cluster:
  1801. # cluster_compute: rte_minimal.yaml
  1802. # run:
  1803. # timeout: 600
  1804. # script: python workloads/rte_ray_client.py
  1805. # wait_for_nodes:
  1806. # num_nodes: 1
  1807. # alert: default
  1808. ########################
  1809. # Serve tests
  1810. ########################
  1811. - name: serve_scale_replicas
  1812. group: Serve tests
  1813. working_dir: serve_tests
  1814. frequency: nightly
  1815. team: serve
  1816. cluster:
  1817. byod: {}
  1818. cluster_compute: compute_tpl_single_node_32_cpu.yaml
  1819. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1820. run:
  1821. timeout: 7200
  1822. long_running: false
  1823. script: python workloads/replica_scalability.py
  1824. alert: default
  1825. variations:
  1826. - __suffix__: aws
  1827. - name: serve_multi_deployment_1k_noop_replica
  1828. group: Serve tests
  1829. working_dir: serve_tests
  1830. frequency: nightly
  1831. team: serve
  1832. cluster:
  1833. byod: {}
  1834. cluster_compute: compute_tpl_32_cpu.yaml
  1835. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1836. run:
  1837. timeout: 7200
  1838. long_running: false
  1839. script: python workloads/multi_deployment_1k_noop_replica.py
  1840. alert: default
  1841. variations:
  1842. - __suffix__: aws
  1843. - __suffix__: aws.py311
  1844. python: "3.11"
  1845. - __suffix__: gce
  1846. env: gce
  1847. frequency: manual
  1848. cluster:
  1849. cluster_compute: compute_tpl_32_cpu_gce.yaml
  1850. - name: serve_autoscaling_load_test
  1851. group: Serve tests
  1852. working_dir: serve_tests
  1853. frequency: nightly
  1854. team: serve
  1855. cluster:
  1856. byod:
  1857. type: gpu
  1858. cluster_compute: compute_tpl_single_node_32_cpu.yaml
  1859. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1860. run:
  1861. timeout: 7200
  1862. long_running: false
  1863. script: python workloads/autoscaling_load_test.py
  1864. alert: default
  1865. variations:
  1866. - __suffix__: aws
  1867. - name: serve_serve_micro_benchmark
  1868. group: Serve tests
  1869. working_dir: serve_tests
  1870. frequency: nightly
  1871. team: serve
  1872. cluster:
  1873. byod: {}
  1874. cluster_compute: compute_tpl_single_node.yaml
  1875. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1876. run:
  1877. timeout: 7200
  1878. long_running: false
  1879. script: python workloads/serve_micro_benchmark.py
  1880. alert: default
  1881. variations:
  1882. - __suffix__: aws
  1883. - __suffix__: gce
  1884. env: gce
  1885. frequency: manual
  1886. cluster:
  1887. cluster_compute: compute_tpl_single_node_gce.yaml
  1888. - name: serve_microbenchmarks
  1889. group: Serve tests
  1890. working_dir: serve_tests
  1891. frequency: nightly
  1892. team: serve
  1893. cluster:
  1894. byod: {}
  1895. cluster_compute: compute_tpl_single_node_32_cpu.yaml
  1896. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1897. run:
  1898. timeout: 7200
  1899. long_running: false
  1900. script: python workloads/microbenchmarks.py
  1901. alert: default
  1902. variations:
  1903. - __suffix__: aws
  1904. - __suffix__: gce
  1905. env: gce
  1906. frequency: manual
  1907. cluster:
  1908. cluster_compute: compute_tpl_single_node_gce.yaml
  1909. - name: serve_resnet_benchmark
  1910. group: Serve tests
  1911. working_dir: serve_tests
  1912. frequency: nightly
  1913. team: serve
  1914. cluster:
  1915. byod:
  1916. type: gpu
  1917. cluster_compute: compute_tpl_gpu_node.yaml
  1918. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1919. run:
  1920. timeout: 7200
  1921. long_running: false
  1922. script: python workloads/serve_resnet_benchmark.py --gpu-env
  1923. alert: default
  1924. variations:
  1925. - __suffix__: aws
  1926. - __suffix__: gce
  1927. env: gce
  1928. frequency: manual
  1929. cluster:
  1930. cluster_compute: compute_tpl_gpu_node_gce.yaml
  1931. ########################
  1932. # Train tests
  1933. ########################
  1934. - name: train_horovod_multi_node_test
  1935. group: Train tests
  1936. working_dir: train_tests/horovod
  1937. frequency: nightly
  1938. team: ml
  1939. cluster:
  1940. byod:
  1941. type: gpu
  1942. post_build_script: byod_horovod_test.sh
  1943. cluster_compute: compute_tpl_aws.yaml
  1944. run:
  1945. timeout: 3000
  1946. script: python train_horovod_multi_node_test.py
  1947. wait_for_nodes:
  1948. num_nodes: 2
  1949. variations:
  1950. - __suffix__: aws
  1951. - __suffix__: gce
  1952. env: gce
  1953. frequency: manual
  1954. cluster:
  1955. cluster_compute: compute_tpl_gce.yaml
  1956. alert: default
  1957. - name: train_multinode_persistence
  1958. group: Train tests
  1959. working_dir: train_tests/multinode_persistence
  1960. frequency: nightly
  1961. team: ml
  1962. cluster:
  1963. byod:
  1964. post_build_script: byod_train_persistence_test.sh
  1965. cluster_compute: compute_aws.yaml
  1966. run:
  1967. timeout: 3000
  1968. script: pytest -v test_persistence.py -s
  1969. wait_for_nodes:
  1970. num_nodes: 4
  1971. variations:
  1972. - __suffix__: aws
  1973. - __suffix__: gce
  1974. env: gce
  1975. frequency: manual
  1976. cluster:
  1977. cluster_compute: compute_gce.yaml
  1978. alert: default
  1979. - name: train_colocate_trainer
  1980. group: Train tests
  1981. working_dir: train_tests/colocate_trainer
  1982. frequency: nightly
  1983. team: ml
  1984. cluster:
  1985. byod: {}
  1986. cluster_compute: compute_aws.yaml
  1987. run:
  1988. timeout: 3000
  1989. script: pytest -v test_colocate_trainer.py -s
  1990. wait_for_nodes:
  1991. num_nodes: 4
  1992. alert: default
  1993. - name: xgboost_train_batch_inference_benchmark_10G
  1994. group: Train tests
  1995. working_dir: train_tests/xgboost_lightgbm
  1996. frequency: nightly
  1997. team: ml
  1998. cluster:
  1999. byod:
  2000. type: gpu
  2001. cluster_compute: compute_aws_1worker.yaml
  2002. run:
  2003. timeout: 36000
  2004. script: python train_batch_inference_benchmark.py "xgboost" --size=10G
  2005. wait_for_nodes:
  2006. num_nodes: 2
  2007. variations:
  2008. - __suffix__: aws
  2009. - __suffix__: gce
  2010. env: gce
  2011. frequency: manual
  2012. cluster:
  2013. cluster_compute: compute_gce_1worker.yaml
  2014. smoke_test:
  2015. frequency: manual
  2016. run:
  2017. timeout: 1800
  2018. alert: default
  2019. - name: xgboost_train_batch_inference_benchmark_100G
  2020. group: Train tests
  2021. working_dir: train_tests/xgboost_lightgbm
  2022. frequency: nightly-3x
  2023. team: ml
  2024. cluster:
  2025. byod:
  2026. type: gpu
  2027. cluster_compute: compute_aws_10workers.yaml
  2028. run:
  2029. timeout: 36000
  2030. script: python train_batch_inference_benchmark.py "xgboost" --size=100G
  2031. wait_for_nodes:
  2032. num_nodes: 11
  2033. variations:
  2034. - __suffix__: aws
  2035. - __suffix__: gce
  2036. env: gce
  2037. frequency: manual
  2038. cluster:
  2039. cluster_compute: compute_gce_10workers.yaml
  2040. smoke_test:
  2041. frequency: manual
  2042. run:
  2043. timeout: 1800
  2044. alert: default
  2045. - name: lightgbm_train_batch_inference_benchmark_10G
  2046. group: Train tests
  2047. working_dir: train_tests/xgboost_lightgbm
  2048. frequency: nightly
  2049. team: ml
  2050. cluster:
  2051. byod:
  2052. type: gpu
  2053. cluster_compute: compute_aws_1worker.yaml
  2054. run:
  2055. timeout: 36000
  2056. script: python train_batch_inference_benchmark.py "lightgbm" --size=10G
  2057. wait_for_nodes:
  2058. num_nodes: 2
  2059. variations:
  2060. - __suffix__: aws
  2061. - __suffix__: gce
  2062. env: gce
  2063. frequency: manual
  2064. cluster:
  2065. cluster_compute: compute_gce_1worker.yaml
  2066. smoke_test:
  2067. frequency: manual
  2068. run:
  2069. timeout: 1800
  2070. alert: default
  2071. - name: lightgbm_train_batch_inference_benchmark_100G
  2072. group: Train tests
  2073. working_dir: train_tests/xgboost_lightgbm
  2074. frequency: nightly-3x
  2075. team: ml
  2076. cluster:
  2077. byod:
  2078. type: gpu
  2079. cluster_compute: compute_aws_10workers.yaml
  2080. run:
  2081. timeout: 36000
  2082. script: python train_batch_inference_benchmark.py "lightgbm" --size=100G
  2083. wait_for_nodes:
  2084. num_nodes: 11
  2085. variations:
  2086. - __suffix__: aws
  2087. - __suffix__: gce
  2088. env: gce
  2089. frequency: manual
  2090. cluster:
  2091. cluster_compute: compute_gce_10workers.yaml
  2092. smoke_test:
  2093. frequency: manual
  2094. run:
  2095. timeout: 1800
  2096. alert: default
  2097. ########################
  2098. # RLlib tests
  2099. ########################
  2100. # ----------------------------------------------------------
  2101. # Checkpointing with RLModule and Learner APIs
  2102. # ----------------------------------------------------------
  2103. - name: rllib_learner_group_checkpointing_multinode
  2104. group: RLlib tests
  2105. working_dir: rllib_tests
  2106. frequency: nightly
  2107. team: rllib
  2108. stable: false
  2109. cluster:
  2110. byod:
  2111. type: gpu
  2112. post_build_script: byod_rllib_test.sh
  2113. runtime_env:
  2114. - RLLIB_TEST_NO_JAX_IMPORT=1
  2115. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2116. cluster_compute: multi_node_checkpointing_compute_config.yaml
  2117. run:
  2118. timeout: 3600
  2119. script: pytest checkpointing_tests/test_learner_group_checkpointing.py
  2120. wait_for_nodes:
  2121. num_nodes: 2
  2122. alert: default
  2123. variations:
  2124. - __suffix__: aws
  2125. - __suffix__: gce
  2126. env: gce
  2127. frequency: manual
  2128. cluster:
  2129. cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  2130. - name: rllib_learner_e2e_module_loading
  2131. group: RLlib tests
  2132. working_dir: rllib_tests
  2133. stable: false
  2134. frequency: nightly
  2135. team: rllib
  2136. cluster:
  2137. byod:
  2138. type: gpu
  2139. post_build_script: byod_rllib_test.sh
  2140. runtime_env:
  2141. - RLLIB_TEST_NO_JAX_IMPORT=1
  2142. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2143. cluster_compute: multi_node_checkpointing_compute_config.yaml
  2144. run:
  2145. timeout: 3600
  2146. script: pytest checkpointing_tests/test_e2e_rl_module_restore.py
  2147. wait_for_nodes:
  2148. num_nodes: 2
  2149. alert: default
  2150. variations:
  2151. - __suffix__: aws
  2152. - __suffix__: gce
  2153. env: gce
  2154. frequency: manual
  2155. cluster:
  2156. cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  2157. # ----------------------------------------------------------
  2158. # Learning and benchmarking tests
  2159. # ----------------------------------------------------------
  2160. # --------------------------
  2161. # DreamerV3
  2162. # --------------------------
  2163. # TODO (sven): Move the algorithm and this test to PyTorch.
  2164. - name: rllib_learning_tests_pong_dreamerv3_tf2
  2165. group: RLlib tests
  2166. working_dir: rllib_tests
  2167. stable: false
  2168. frequency: weekly
  2169. team: rllib
  2170. cluster:
  2171. byod:
  2172. type: gpu
  2173. post_build_script: byod_rllib_dreamerv3_test.sh
  2174. runtime_env:
  2175. - RLLIB_TEST_NO_JAX_IMPORT=1
  2176. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2177. cluster_compute: 1gpu_4cpus.yaml
  2178. run:
  2179. timeout: 43200 # 12h
  2180. script: python learning_tests/tuned_examples/dreamerv3/atari_100k.py --framework=tf2 --env=ALE/Pong-v5 --num-gpus=1 --stop-reward=15.0 --as-release-test
  2181. alert: default
  2182. variations:
  2183. - __suffix__: aws
  2184. - __suffix__: gce
  2185. env: gce
  2186. frequency: manual
  2187. cluster:
  2188. cluster_compute: 1gpu_4cpus_gce.yaml
  2189. # --------------------------
  2190. # PPO
  2191. # --------------------------
  2192. - name: rllib_learning_tests_pong_ppo_torch
  2193. group: RLlib tests
  2194. working_dir: rllib_tests
  2195. stable: true
  2196. frequency: nightly
  2197. team: rllib
  2198. cluster:
  2199. byod:
  2200. type: gpu
  2201. post_build_script: byod_rllib_test.sh
  2202. runtime_env:
  2203. - RLLIB_TEST_NO_JAX_IMPORT=1
  2204. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2205. cluster_compute: 8gpus_96cpus.yaml
  2206. run:
  2207. timeout: 1200
  2208. script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env=ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --stop-reward=20.0 --as-release-test
  2209. alert: default
  2210. variations:
  2211. - __suffix__: aws
  2212. - __suffix__: gce
  2213. env: gce
  2214. frequency: manual
  2215. cluster:
  2216. cluster_compute: 8gpus_96cpus_gce.yaml
  2217. ########################
  2218. # Core Nightly Tests
  2219. ########################
  2220. - name: shuffle_100gb
  2221. group: core-multi-test
  2222. working_dir: nightly_tests
  2223. frequency: nightly
  2224. team: core
  2225. cluster:
  2226. byod:
  2227. runtime_env:
  2228. - RAY_worker_killing_policy=retriable_lifo
  2229. cluster_compute: shuffle/shuffle_compute_multi.yaml
  2230. run:
  2231. timeout: 3000
  2232. script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
  2233. wait_for_nodes:
  2234. num_nodes: 4
  2235. variations:
  2236. - __suffix__: aws
  2237. - __suffix__: gce
  2238. env: gce
  2239. frequency: manual
  2240. cluster:
  2241. cluster_compute: shuffle/shuffle_compute_multi_gce.yaml
  2242. - name: stress_test_placement_group
  2243. group: core-multi-test
  2244. working_dir: nightly_tests
  2245. frequency: nightly
  2246. team: core
  2247. cluster:
  2248. byod: {}
  2249. cluster_compute: stress_tests/placement_group_tests_compute.yaml
  2250. run:
  2251. timeout: 7200
  2252. script: python stress_tests/test_placement_group.py
  2253. variations:
  2254. - __suffix__: aws
  2255. - __suffix__: gce
  2256. env: gce
  2257. frequency: manual
  2258. cluster:
  2259. cluster_compute: stress_tests/placement_group_tests_compute_gce.yaml
  2260. - name: decision_tree_autoscaling_20_runs
  2261. group: core-multi-test
  2262. working_dir: nightly_tests
  2263. frequency: nightly
  2264. team: core
  2265. cluster:
  2266. byod: {}
  2267. cluster_compute: decision_tree/autoscaling_compute.yaml
  2268. run:
  2269. timeout: 9600
  2270. script: python decision_tree/cart_with_tree.py --concurrency=20
  2271. variations:
  2272. - __suffix__: aws
  2273. - __suffix__: gce
  2274. env: gce
  2275. frequency: manual
  2276. cluster:
  2277. cluster_compute: decision_tree/autoscaling_compute_gce.yaml
  2278. - name: autoscaling_shuffle_1tb_1000_partitions
  2279. group: core-multi-test
  2280. working_dir: nightly_tests
  2281. frequency: nightly
  2282. team: core
  2283. cluster:
  2284. byod:
  2285. runtime_env:
  2286. - RAY_worker_killing_policy=retriable_lifo
  2287. cluster_compute: shuffle/shuffle_compute_autoscaling.yaml
  2288. run:
  2289. timeout: 4000
  2290. script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
  2291. --no-streaming
  2292. variations:
  2293. - __suffix__: aws
  2294. - __suffix__: gce
  2295. env: gce
  2296. frequency: manual
  2297. cluster:
  2298. cluster_compute: shuffle/shuffle_compute_autoscaling_gce.yaml
  2299. - name: microbenchmark
  2300. group: core-daily-test
  2301. team: core
  2302. frequency: nightly
  2303. working_dir: microbenchmark
  2304. cluster:
  2305. byod: {}
  2306. cluster_compute: tpl_64.yaml
  2307. run:
  2308. timeout: 1800
  2309. script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py
  2310. variations:
  2311. - __suffix__: aws
  2312. repeated_run: 5
  2313. - __suffix__: gce
  2314. env: gce
  2315. frequency: manual
  2316. cluster:
  2317. cluster_compute: tpl_64_gce.yaml
  2318. - __suffix__: aws.py311
  2319. frequency: weekly
  2320. python: "3.11"
  2321. - name: microbenchmark_unstable
  2322. group: core-daily-test
  2323. team: core
  2324. frequency: nightly
  2325. working_dir: microbenchmark
  2326. stable: false
  2327. cluster:
  2328. byod: {}
  2329. cluster_compute: tpl_64.yaml
  2330. run:
  2331. timeout: 1800
  2332. script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py --experimental
  2333. - name: microbenchmark_gpu_unstable
  2334. group: core-daily-test
  2335. team: core
  2336. frequency: nightly
  2337. working_dir: microbenchmark
  2338. stable: false
  2339. cluster:
  2340. byod:
  2341. type: gpu
  2342. cluster_compute: experimental/compute_gpu_2_aws.yaml
  2343. run:
  2344. timeout: 1800
  2345. script: python experimental/accelerated_dag_gpu_microbenchmark.py
  2346. - name: benchmark_worker_startup
  2347. group: core-daily-test
  2348. team: core
  2349. frequency: nightly
  2350. working_dir: benchmark-worker-startup
  2351. stable: false
  2352. cluster:
  2353. byod:
  2354. type: gpu
  2355. cluster_compute: only_head_node_1gpu_64cpu.yaml
  2356. run:
  2357. timeout: 7200
  2358. script: python benchmark_worker_startup.py
  2359. --num_cpus_in_cluster 64
  2360. --num_gpus_in_cluster 64
  2361. --num_tasks_or_actors_per_run 64
  2362. --num_measurements_per_configuration 5
  2363. variations:
  2364. - __suffix__: aws
  2365. - __suffix__: gce
  2366. env: gce
  2367. frequency: manual
  2368. cluster:
  2369. cluster_compute: only_head_node_1gpu_64cpu_gce.yaml
  2370. - name: dask_on_ray_100gb_sort
  2371. group: core-daily-test
  2372. working_dir: nightly_tests
  2373. frequency: nightly
  2374. team: core
  2375. # https://github.com/ray-project/ray/issues/39165
  2376. stable: false
  2377. cluster:
  2378. byod:
  2379. runtime_env:
  2380. - RAY_worker_killing_policy=retriable_lifo
  2381. cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml
  2382. run:
  2383. timeout: 7200
  2384. script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions
  2385. 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
  2386. variations:
  2387. - __suffix__: aws
  2388. - __suffix__: aws.py311
  2389. frequency: weekly
  2390. python: "3.11"
  2391. - __suffix__: gce
  2392. env: gce
  2393. frequency: manual
  2394. cluster:
  2395. cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template_gce.yaml
  2396. - name: dask_on_ray_large_scale_test_spilling
  2397. group: core-daily-test
  2398. working_dir: nightly_tests
  2399. frequency: nightly
  2400. team: data
  2401. cluster:
  2402. byod:
  2403. runtime_env:
  2404. - RAY_worker_killing_policy=retriable_lifo
  2405. cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
  2406. run:
  2407. timeout: 7200
  2408. script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
  2409. 70 --error_rate 0 --data_save_path /tmp/ray
  2410. wait_for_nodes:
  2411. num_nodes: 21
  2412. smoke_test:
  2413. frequency: nightly
  2414. cluster:
  2415. cluster_compute: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
  2416. run:
  2417. timeout: 7200
  2418. script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb
  2419. 70 --error_rate 0 --data_save_path /tmp/ray
  2420. wait_for_nodes:
  2421. num_nodes: 5
  2422. - name: stress_test_state_api_scale
  2423. group: core-daily-test
  2424. working_dir: nightly_tests
  2425. stable: false
  2426. frequency: nightly
  2427. team: core
  2428. cluster:
  2429. byod:
  2430. runtime_env:
  2431. - RAY_MAX_LIMIT_FROM_API_SERVER=1000000000
  2432. - RAY_MAX_LIMIT_FROM_DATA_SOURCE=1000000000
  2433. cluster_compute: stress_tests/stress_tests_compute_large.yaml
  2434. run:
  2435. timeout: 4200
  2436. script: python stress_tests/test_state_api_scale.py
  2437. wait_for_nodes:
  2438. num_nodes: 7
  2439. smoke_test:
  2440. frequency: nightly
  2441. cluster:
  2442. app_config: stress_tests/state_api_app_config.yaml
  2443. cluster_compute: stress_tests/smoke_test_compute.yaml
  2444. run:
  2445. timeout: 3600
  2446. wait_for_nodes:
  2447. num_nodes: 5
  2448. script: python stress_tests/test_state_api_scale.py --smoke-test
  2449. variations:
  2450. - __suffix__: aws
  2451. - __suffix__: aws.py311
  2452. frequency: manual
  2453. python: "3.11"
  2454. smoke_test:
  2455. frequency: nightly-3x
  2456. - __suffix__: gce
  2457. env: gce
  2458. frequency: manual
  2459. cluster:
  2460. cluster_compute: stress_tests/stress_tests_compute_large_gce.yaml
  2461. smoke_test:
  2462. frequency: manual
  2463. - name: shuffle_20gb_with_state_api
  2464. group: core-daily-test
  2465. working_dir: nightly_tests
  2466. frequency: nightly
  2467. team: core
  2468. cluster:
  2469. byod:
  2470. runtime_env:
  2471. - RAY_MAX_LIMIT_FROM_API_SERVER=1000000000
  2472. - RAY_MAX_LIMIT_FROM_DATA_SOURCE=1000000000
  2473. cluster_compute: shuffle/shuffle_compute_single.yaml
  2474. run:
  2475. timeout: 1000
  2476. script: python stress_tests/test_state_api_with_other_tests.py
  2477. nightly_tests/shuffle/shuffle_test.py --test-args="--num-partitions=100 --partition-size=200e6"
  2478. variations:
  2479. - __suffix__: aws
  2480. - __suffix__: gce
  2481. env: gce
  2482. frequency: manual
  2483. cluster:
  2484. cluster_compute: shuffle/shuffle_compute_single_gce.yaml
  2485. - name: stress_test_many_tasks
  2486. group: core-daily-test
  2487. working_dir: nightly_tests
  2488. frequency: nightly
  2489. team: core
  2490. cluster:
  2491. byod: {}
  2492. cluster_compute: stress_tests/stress_tests_compute.yaml
  2493. run:
  2494. timeout: 14400
  2495. wait_for_nodes:
  2496. num_nodes: 101
  2497. script: python stress_tests/test_many_tasks.py
  2498. smoke_test:
  2499. frequency: nightly
  2500. cluster:
  2501. app_config: stress_tests/stress_tests_app_config.yaml
  2502. cluster_compute: stress_tests/smoke_test_compute.yaml
  2503. run:
  2504. timeout: 3600
  2505. wait_for_nodes:
  2506. num_nodes: 5
  2507. script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
  2508. variations:
  2509. - __suffix__: aws
  2510. - __suffix__: gce
  2511. env: gce
  2512. frequency: manual
  2513. cluster:
  2514. cluster_compute: stress_tests/stress_tests_compute_gce.yaml
  2515. smoke_test:
  2516. frequency: manual
  2517. - name: stress_test_dead_actors
  2518. group: core-daily-test
  2519. working_dir: nightly_tests
  2520. frequency: nightly
  2521. team: core
  2522. cluster:
  2523. byod: {}
  2524. cluster_compute: stress_tests/stress_tests_compute.yaml
  2525. run:
  2526. timeout: 7200
  2527. wait_for_nodes:
  2528. num_nodes: 101
  2529. script: python stress_tests/test_dead_actors.py
  2530. smoke_test:
  2531. frequency: nightly
  2532. cluster:
  2533. app_config: stress_tests/stress_tests_app_config.yaml
  2534. cluster_compute: stress_tests/smoke_test_compute.yaml
  2535. run:
  2536. timeout: 3600
  2537. wait_for_nodes:
  2538. num_nodes: 5
  2539. script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3
  2540. --num-children=3
  2541. variations:
  2542. - __suffix__: aws
  2543. - __suffix__: gce
  2544. env: gce
  2545. frequency: manual
  2546. cluster:
  2547. cluster_compute: stress_tests/stress_tests_compute_gce.yaml
  2548. smoke_test:
  2549. frequency: manual
  2550. # The full test is not stable, so run the smoke test only.
  2551. # See https://github.com/ray-project/ray/issues/23244.
  2552. - name: threaded_actors_stress_test
  2553. group: core-daily-test
  2554. working_dir: nightly_tests
  2555. frequency: nightly
  2556. team: core
  2557. cluster:
  2558. byod: {}
  2559. cluster_compute: stress_tests/smoke_test_compute.yaml
  2560. run:
  2561. timeout: 3600
  2562. script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
  2563. 30
  2564. wait_for_nodes:
  2565. num_nodes: 5
  2566. variations:
  2567. - __suffix__: aws
  2568. - __suffix__: gce
  2569. env: gce
  2570. frequency: manual
  2571. cluster:
  2572. cluster_compute: stress_tests/smoke_test_compute_gce.yaml
  2573. # - name: threaded_actors_stress_test
  2574. # group: core-daily-test
  2575. # working_dir: nightly_tests
  2576. #
  2577. # frequency: nightly
  2578. # team: core
  2579. # cluster:
  2580. # cluster_compute: stress_tests/stress_test_threaded_actor_compute.yaml
  2581. #
  2582. # run:
  2583. # timeout: 7200
  2584. # script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s
  2585. # 60
  2586. #
  2587. # wait_for_nodes:
  2588. # num_nodes: 201
  2589. # timeout: 600
  2590. #
  2591. # smoke_test:
  2592. # frequency: nightly
  2593. # cluster:
  2594. # app_config: stress_tests/stress_tests_app_config.yaml
  2595. # cluster_compute: stress_tests/smoke_test_compute.yaml
  2596. #
  2597. # run:
  2598. # timeout: 3600
  2599. # script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
  2600. # 30
  2601. #
  2602. # wait_for_nodes:
  2603. # num_nodes: 5
  2604. # timeout: 600
  2605. - name: stress_test_many_runtime_envs
  2606. group: core-daily-test
  2607. working_dir: nightly_tests
  2608. frequency: nightly
  2609. team: core
  2610. cluster:
  2611. byod: {}
  2612. cluster_compute: stress_tests/smoke_test_compute.yaml
  2613. run:
  2614. timeout: 14400
  2615. wait_for_nodes:
  2616. num_nodes: 5
  2617. script: python stress_tests/test_many_runtime_envs.py --num_runtime_envs=100 --num_tasks=10000
  2618. variations:
  2619. - __suffix__: aws
  2620. - __suffix__: gce
  2621. env: gce
  2622. frequency: manual
  2623. cluster:
  2624. cluster_compute: stress_tests/smoke_test_compute_gce.yaml
  2625. smoke_test:
  2626. frequency: manual
  2627. - name: single_node_oom
  2628. group: core-daily-test
  2629. working_dir: nightly_tests
  2630. frequency: nightly
  2631. team: core
  2632. cluster:
  2633. byod: {}
  2634. cluster_compute: stress_tests/stress_tests_single_node_oom_compute.yaml
  2635. run:
  2636. timeout: 1000
  2637. script: python stress_tests/test_parallel_tasks_memory_pressure.py --num-tasks 20
  2638. variations:
  2639. - __suffix__: aws
  2640. - __suffix__: gce
  2641. env: gce
  2642. frequency: manual
  2643. cluster:
  2644. cluster_compute: stress_tests/stress_tests_single_node_oom_compute_gce.yaml
  2645. - name: tune_air_oom
  2646. group: core-daily-test
  2647. working_dir: air_tests
  2648. stable: false
  2649. frequency: nightly
  2650. team: core
  2651. cluster:
  2652. byod:
  2653. runtime_env:
  2654. - RAY_memory_usage_threshold=0.7
  2655. - RAY_task_oom_retries=-1
  2656. cluster_compute: oom/stress_tests_tune_air_oom_compute.yaml
  2657. run:
  2658. timeout: 3600
  2659. script: bash oom/tune_air_oom.sh
  2660. - name: dask_on_ray_1tb_sort
  2661. group: core-daily-test
  2662. working_dir: nightly_tests
  2663. frequency: nightly-3x
  2664. team: core
  2665. cluster:
  2666. byod:
  2667. runtime_env:
  2668. - RAY_worker_killing_policy=retriable_lifo
  2669. cluster_compute: dask_on_ray/1tb_sort_compute.yaml
  2670. run:
  2671. timeout: 7200
  2672. script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions
  2673. 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
  2674. wait_for_nodes:
  2675. num_nodes: 32
  2676. - name: many_nodes_actor_test_on_v2
  2677. group: core-daily-test
  2678. working_dir: benchmarks
  2679. frequency: nightly-3x
  2680. team: core
  2681. cluster:
  2682. byod: {}
  2683. cluster_compute: distributed/many_nodes_tests/compute_config.yaml
  2684. run:
  2685. timeout: 3600
  2686. # 2cpus per node x 1000 nodes / 0.2 cpus per actor = 10k
  2687. # 2cpus per node x 2000 nodes / 0.2 cpus per actor = 20k
  2688. script: python distributed/many_nodes_tests/actor_test.py --no-wait --cpus-per-actor=0.2 --total-actors 10000 20000
  2689. wait_for_nodes:
  2690. num_nodes: 500
  2691. variations:
  2692. - __suffix__: aws
  2693. - __suffix__: gce
  2694. env: gce
  2695. frequency: manual
  2696. cluster:
  2697. cluster_compute: distributed/many_nodes_tests/compute_config_gce.yaml
  2698. #- name: many_nodes_multi_master_test
  2699. # group: core-daily-test
  2700. # working_dir: nightly_tests
  2701. #
  2702. # frequency: nightly-3x
  2703. # team: core
  2704. # cluster:
  2705. # cluster_compute: many_nodes_tests/compute_config.yaml
  2706. #
  2707. # run:
  2708. # timeout: 7200
  2709. # script: python many_nodes_tests/multi_master_test.py
  2710. # wait_for_nodes:
  2711. # num_nodes: 251
  2712. #
  2713. - name: pg_autoscaling_regression_test
  2714. group: core-daily-test
  2715. working_dir: nightly_tests
  2716. frequency: nightly
  2717. team: core
  2718. cluster:
  2719. byod: {}
  2720. cluster_compute: placement_group_tests/compute.yaml
  2721. run:
  2722. timeout: 1200
  2723. script: python placement_group_tests/pg_run.py
  2724. variations:
  2725. - __suffix__: aws
  2726. - __suffix__: gce
  2727. env: gce
  2728. frequency: manual
  2729. cluster:
  2730. cluster_compute: placement_group_tests/compute_gce.yaml
  2731. - name: placement_group_performance_test
  2732. group: core-daily-test
  2733. working_dir: nightly_tests
  2734. frequency: nightly
  2735. team: core
  2736. cluster:
  2737. byod: {}
  2738. cluster_compute: placement_group_tests/pg_perf_test_compute.yaml
  2739. run:
  2740. timeout: 1200
  2741. script: python placement_group_tests/placement_group_performance_test.py
  2742. wait_for_nodes:
  2743. num_nodes: 5
  2744. variations:
  2745. - __suffix__: aws
  2746. - __suffix__: gce
  2747. env: gce
  2748. frequency: manual
  2749. cluster:
  2750. cluster_compute: placement_group_tests/pg_perf_test_compute_gce.yaml
  2751. #########################
  2752. # Core Scalability Tests
  2753. #########################
  2754. - name: single_node
  2755. group: core-scalability-test
  2756. working_dir: benchmarks
  2757. frequency: nightly
  2758. team: core
  2759. cluster:
  2760. byod:
  2761. type: gpu
  2762. runtime_env:
  2763. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2764. cluster_compute: single_node.yaml
  2765. run:
  2766. timeout: 12000
  2767. prepare: sleep 0
  2768. script: python single_node/test_single_node.py
  2769. variations:
  2770. - __suffix__: aws
  2771. - __suffix__: gce
  2772. env: gce
  2773. frequency: manual
  2774. cluster:
  2775. cluster_compute: single_node_gce.yaml
  2776. - name: object_store
  2777. group: core-scalability-test
  2778. working_dir: benchmarks
  2779. frequency: nightly
  2780. team: core
  2781. cluster:
  2782. byod:
  2783. type: gpu
  2784. runtime_env:
  2785. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2786. cluster_compute: object_store.yaml
  2787. run:
  2788. timeout: 3600
  2789. script: python object_store/test_object_store.py
  2790. wait_for_nodes:
  2791. num_nodes: 50
  2792. variations:
  2793. - __suffix__: aws
  2794. - __suffix__: gce
  2795. env: gce
  2796. frequency: manual
  2797. cluster:
  2798. cluster_compute: object_store_gce.yaml
  2799. - name: many_actors
  2800. group: core-scalability-test
  2801. working_dir: benchmarks
  2802. frequency: nightly-3x
  2803. team: core
  2804. cluster:
  2805. byod:
  2806. type: gpu
  2807. runtime_env:
  2808. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2809. cluster_compute: distributed.yaml
  2810. run:
  2811. timeout: 3600
  2812. script: python distributed/test_many_actors.py
  2813. wait_for_nodes:
  2814. num_nodes: 65
  2815. variations:
  2816. - __suffix__: aws
  2817. - __suffix__: gce
  2818. env: gce
  2819. frequency: manual
  2820. cluster:
  2821. cluster_compute: distributed_gce.yaml
  2822. - name: many_actors_smoke_test
  2823. group: core-scalability-test
  2824. working_dir: benchmarks
  2825. frequency: nightly
  2826. team: core
  2827. cluster:
  2828. byod:
  2829. type: gpu
  2830. runtime_env:
  2831. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2832. cluster_compute: distributed_smoke_test.yaml
  2833. run:
  2834. timeout: 3600
  2835. script: SMOKE_TEST=1 python distributed/test_many_actors.py
  2836. wait_for_nodes:
  2837. num_nodes: 2
  2838. - name: many_tasks
  2839. group: core-scalability-test
  2840. working_dir: benchmarks
  2841. frequency: nightly
  2842. team: core
  2843. cluster:
  2844. byod:
  2845. type: gpu
  2846. runtime_env:
  2847. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2848. cluster_compute: distributed.yaml
  2849. run:
  2850. timeout: 3600
  2851. script: python distributed/test_many_tasks.py --num-tasks=10000
  2852. wait_for_nodes:
  2853. num_nodes: 65
  2854. variations:
  2855. - __suffix__: aws
  2856. - __suffix__: gce
  2857. env: gce
  2858. frequency: manual
  2859. cluster:
  2860. cluster_compute: distributed_gce.yaml
  2861. - name: many_pgs
  2862. group: core-scalability-test
  2863. working_dir: benchmarks
  2864. frequency: nightly-3x
  2865. team: core
  2866. cluster:
  2867. byod:
  2868. type: gpu
  2869. runtime_env:
  2870. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2871. cluster_compute: distributed.yaml
  2872. run:
  2873. timeout: 3600
  2874. script: python distributed/test_many_pgs.py
  2875. wait_for_nodes:
  2876. num_nodes: 65
  2877. variations:
  2878. - __suffix__: aws
  2879. - __suffix__: gce
  2880. env: gce
  2881. frequency: manual
  2882. cluster:
  2883. cluster_compute: distributed_gce.yaml
  2884. - name: many_pgs_smoke_test
  2885. group: core-scalability-test
  2886. working_dir: benchmarks
  2887. frequency: nightly
  2888. team: core
  2889. cluster:
  2890. byod:
  2891. type: gpu
  2892. runtime_env:
  2893. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2894. cluster_compute: distributed_smoke_test.yaml
  2895. run:
  2896. timeout: 3600
  2897. script: SMOKE_TEST=1 python distributed/test_many_pgs.py
  2898. wait_for_nodes:
  2899. num_nodes: 2
  2900. - name: many_nodes
  2901. group: core-scalability-test
  2902. working_dir: benchmarks
  2903. frequency: nightly-3x
  2904. team: core
  2905. cluster:
  2906. byod:
  2907. type: gpu
  2908. runtime_env:
  2909. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2910. cluster_compute: many_nodes.yaml
  2911. run:
  2912. timeout: 3600
  2913. script: python distributed/test_many_tasks.py --num-tasks=1000
  2914. wait_for_nodes:
  2915. num_nodes: 250
  2916. variations:
  2917. - __suffix__: aws
  2918. - __suffix__: gce
  2919. env: gce
  2920. frequency: manual
  2921. cluster:
  2922. cluster_compute: many_nodes_gce.yaml
  2923. - name: scheduling_test_many_0s_tasks_many_nodes
  2924. group: core-scalability-test
  2925. working_dir: benchmarks
  2926. frequency: nightly
  2927. team: core
  2928. cluster:
  2929. byod:
  2930. type: gpu
  2931. runtime_env:
  2932. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2933. cluster_compute: scheduling.yaml
  2934. run:
  2935. timeout: 3600
  2936. script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  2937. --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
  2938. wait_for_nodes:
  2939. num_nodes: 32
  2940. variations:
  2941. - __suffix__: aws
  2942. - __suffix__: gce
  2943. env: gce
  2944. frequency: manual
  2945. cluster:
  2946. cluster_compute: scheduling_gce.yaml
  2947. # - name: scheduling_test_many_5s_tasks_single_node
  2948. # group: core-scalability-test
  2949. # working_dir: benchmarks
  2950. # frequency: nightly
  2951. # team: core
  2952. # cluster:
  2953. # cluster_compute: scheduling.yaml
  2954. # run:
  2955. # timeout: 3600
  2956. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  2957. # --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  2958. # wait_for_nodes:
  2959. # num_nodes: 32
  2960. # timeout: 600
  2961. # stable: false
  2962. # - name: scheduling_test_many_5s_tasks_many_nodes
  2963. # group: core-scalability-test
  2964. # working_dir: benchmarks
  2965. # frequency: nightly
  2966. # team: core
  2967. # cluster:
  2968. # cluster_compute: scheduling.yaml
  2969. # run:
  2970. # timeout: 3600
  2971. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  2972. # --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  2973. # wait_for_nodes:
  2974. # num_nodes: 32
  2975. # timeout: 600
  2976. # stable: false
  2977. ###############
  2978. # Dataset tests
  2979. ###############
  2980. - name: parquet_metadata_resolution
  2981. group: data-tests
  2982. working_dir: nightly_tests/dataset
  2983. frequency: nightly
  2984. team: data
  2985. cluster:
  2986. byod:
  2987. type: gpu
  2988. cluster_compute: single_node_benchmark_compute.yaml
  2989. run:
  2990. # Expect the test to finish around 40 seconds.
  2991. timeout: 100
  2992. script: python parquet_metadata_resolution.py --num-files 915 --cloud aws
  2993. variations:
  2994. - __suffix__: aws
  2995. - __suffix__: gce
  2996. env: gce
  2997. frequency: manual
  2998. cluster:
  2999. cluster_compute: single_node_benchmark_compute_gce.yaml
  3000. run:
  3001. script: python parquet_metadata_resolution.py --num-files 915 --cloud gcp
  3002. - name: dataset_random_access
  3003. group: data-tests
  3004. working_dir: nightly_tests/dataset
  3005. stable: false
  3006. frequency: manual
  3007. team: data
  3008. cluster:
  3009. byod:
  3010. type: gpu
  3011. pip:
  3012. - git+https://github.com/ray-project/ray_shuffling_data_loader.git@add-embedding-model
  3013. cluster_compute: pipelined_training_compute.yaml
  3014. run:
  3015. timeout: 1200
  3016. script: python dataset_random_access.py
  3017. wait_for_nodes:
  3018. num_nodes: 15
  3019. variations:
  3020. - __suffix__: aws
  3021. - __suffix__: gce
  3022. env: gce
  3023. frequency: manual
  3024. cluster:
  3025. cluster_compute: pipelined_training_compute_gce.yaml
  3026. - name: stable_diffusion_benchmark
  3027. group: data-tests
  3028. working_dir: nightly_tests/dataset
  3029. frequency: nightly
  3030. team: data
  3031. cluster:
  3032. byod:
  3033. type: gpu
  3034. post_build_script: byod_stable_diffusion.sh
  3035. cluster_compute: stable_diffusion_benchmark_compute.yaml
  3036. run:
  3037. timeout: 1800
  3038. script: python stable_diffusion_benchmark.py
  3039. variations:
  3040. - __suffix__: aws
  3041. - __suffix__: gce
  3042. env: gce
  3043. frequency: manual
  3044. cluster:
  3045. cluster_compute: stable_diffusion_benchmark_compute_gce.yaml
  3046. - name: streaming_data_ingest_benchmark_1tb
  3047. group: data-tests
  3048. working_dir: nightly_tests/dataset
  3049. frequency: nightly
  3050. team: data
  3051. cluster:
  3052. byod:
  3053. type: gpu
  3054. cluster_compute: data_ingest_benchmark_compute.yaml
  3055. run:
  3056. timeout: 300
  3057. script: python data_ingest_benchmark.py --dataset-size-gb=1000 --num-workers=20 --streaming
  3058. wait_for_nodes:
  3059. num_nodes: 20
  3060. variations:
  3061. - __suffix__: aws
  3062. - __suffix__: gce
  3063. env: gce
  3064. frequency: manual
  3065. cluster:
  3066. cluster_compute: data_ingest_benchmark_compute_gce.yaml
  3067. - name: streaming_data_ingest_benchmark_100gb_gpu
  3068. group: data-tests
  3069. working_dir: nightly_tests/dataset
  3070. frequency: nightly
  3071. team: data
  3072. cluster:
  3073. byod:
  3074. type: gpu
  3075. cluster_compute: data_ingest_benchmark_compute_gpu.yaml
  3076. run:
  3077. timeout: 300
  3078. script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu
  3079. wait_for_nodes:
  3080. num_nodes: 3
  3081. variations:
  3082. - __suffix__: aws
  3083. - __suffix__: gce
  3084. env: gce
  3085. frequency: manual
  3086. cluster:
  3087. cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
  3088. # This test case will early stop the data ingestion iteration on the GPU actors.
  3089. # This is a common usage in PyTorch Lightning
  3090. # (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches).
  3091. # There was a bug in Ray Data that caused a GPU memory leak (see #3.919).
  3092. # We add this test case to cover this scenario.
  3093. - name: streaming_data_ingest_benchmark_100gb_gpu_early_stop
  3094. group: data-tests
  3095. working_dir: nightly_tests/dataset
  3096. frequency: nightly
  3097. team: data
  3098. cluster:
  3099. byod:
  3100. type: gpu
  3101. cluster_compute: data_ingest_benchmark_compute_gpu.yaml
  3102. run:
  3103. timeout: 300
  3104. script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu --early-stop
  3105. wait_for_nodes:
  3106. num_nodes: 3
  3107. variations:
  3108. - __suffix__: aws
  3109. - __suffix__: gce
  3110. env: gce
  3111. frequency: manual
  3112. cluster:
  3113. cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
  3114. - name: aggregate_benchmark
  3115. group: data-tests
  3116. working_dir: nightly_tests/dataset
  3117. frequency: nightly
  3118. team: data
  3119. cluster:
  3120. byod:
  3121. type: gpu
  3122. cluster_compute: single_node_benchmark_compute.yaml
  3123. run:
  3124. timeout: 1800
  3125. script: python aggregate_benchmark.py
  3126. variations:
  3127. - __suffix__: aws
  3128. - __suffix__: gce
  3129. env: gce
  3130. frequency: manual
  3131. cluster:
  3132. cluster_compute: single_node_benchmark_compute_gce.yaml
  3133. - name: read_parquet_benchmark_single_node
  3134. group: data-tests
  3135. working_dir: nightly_tests/dataset
  3136. frequency: nightly
  3137. team: data
  3138. cluster:
  3139. byod:
  3140. type: gpu
  3141. post_build_script: byod_install_mosaicml.sh
  3142. cluster_compute: single_node_benchmark_compute.yaml
  3143. run:
  3144. # Expect the benchmark to finish in 400 seconds.
  3145. timeout: 400
  3146. script: python read_parquet_benchmark.py
  3147. variations:
  3148. - __suffix__: aws
  3149. - __suffix__: gce
  3150. env: gce
  3151. frequency: manual
  3152. cluster:
  3153. cluster_compute: single_node_benchmark_compute_gce.yaml
  3154. - name: read_images_benchmark_single_node
  3155. group: data-tests
  3156. working_dir: nightly_tests/dataset
  3157. frequency: nightly
  3158. team: data
  3159. cluster:
  3160. byod:
  3161. type: gpu
  3162. post_build_script: byod_install_mosaicml.sh
  3163. cluster_compute: single_node_benchmark_compute.yaml
  3164. run:
  3165. timeout: 1800
  3166. script: python read_images_benchmark.py --single-node
  3167. variations:
  3168. - __suffix__: aws
  3169. - __suffix__: gce
  3170. env: gce
  3171. frequency: manual
  3172. cluster:
  3173. cluster_compute: single_node_benchmark_compute_gce.yaml
  3174. # TODO: Re-enable this test once we fix https://github.com/ray-project/ray/issues/40686.
  3175. # - name: read_images_benchmark_multi_node
  3176. # group: data-tests
  3177. # working_dir: nightly_tests/dataset
  3178. # frequency: nightly-3x
  3179. # team: data
  3180. # cluster:
  3181. # byod:
  3182. # type: gpu
  3183. # cluster_compute: multi_node_read_images_benchmark_compute.yaml
  3184. # run:
  3185. # timeout: 28800
  3186. # script: python read_images_benchmark.py --multi-node
  3187. # variations:
  3188. # - __suffix__: aws
  3189. # - __suffix__: gce
  3190. # env: gce
  3191. # frequency: manual
  3192. # cluster:
  3193. # cluster_compute: multi_node_read_images_benchmark_compute_gce.yaml
  3194. - name: read_images_comparison_microbenchmark_single_node
  3195. group: data-tests
  3196. working_dir: nightly_tests/dataset
  3197. frequency: nightly
  3198. team: data
  3199. cluster:
  3200. byod:
  3201. type: gpu
  3202. post_build_script: byod_install_mosaicml.sh
  3203. cluster_compute: single_node_benchmark_compute.yaml
  3204. run:
  3205. timeout: 1800
  3206. script: bash run_image_loader_microbenchmark.sh
  3207. variations:
  3208. - __suffix__: aws
  3209. - __suffix__: gce
  3210. env: gce
  3211. frequency: manual
  3212. cluster:
  3213. cluster_compute: single_node_benchmark_compute_gce.yaml
  3214. - name: read_images_train_4_gpu
  3215. group: data-tests
  3216. working_dir: nightly_tests/dataset
  3217. frequency: nightly
  3218. team: data
  3219. cluster:
  3220. byod:
  3221. type: gpu
  3222. post_build_script: byod_install_mosaicml.sh
  3223. cluster_compute: multi_node_train_4_workers.yaml
  3224. run:
  3225. timeout: 18000
  3226. script: python multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 2
  3227. variations:
  3228. - __suffix__: aws
  3229. - __suffix__: gce
  3230. env: gce
  3231. frequency: manual
  3232. cluster:
  3233. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3234. - name: read_images_train_4_gpu_worker_chaos
  3235. group: data-tests
  3236. working_dir: nightly_tests
  3237. frequency: nightly
  3238. team: data
  3239. cluster:
  3240. byod:
  3241. type: gpu
  3242. post_build_script: byod_install_mosaicml.sh
  3243. cluster_compute: dataset/multi_node_train_4_workers.yaml
  3244. run:
  3245. timeout: 18000
  3246. prepare: python setup_chaos.py --kill-workers --kill-interval 100 --max-to-kill 3 --task-names "ReadImage->Map(wnid_to_index)->Map(crop_and_flip_image)"
  3247. script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
  3248. variations:
  3249. - __suffix__: aws
  3250. - __suffix__: gce
  3251. env: gce
  3252. frequency: manual
  3253. cluster:
  3254. cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3255. - name: read_images_train_4_gpu_node_chaos
  3256. group: data-tests
  3257. working_dir: nightly_tests
  3258. frequency: nightly
  3259. team: data
  3260. cluster:
  3261. byod:
  3262. type: gpu
  3263. post_build_script: byod_install_mosaicml.sh
  3264. cluster_compute: dataset/multi_node_train_4_workers.yaml
  3265. run:
  3266. timeout: 18000
  3267. prepare: python setup_chaos.py --kill-interval 200 --max-to-kill 1 --task-names "_RayTrainWorker__execute.get_next"
  3268. script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
  3269. variations:
  3270. - __suffix__: aws
  3271. - __suffix__: gce
  3272. env: gce
  3273. frequency: manual
  3274. cluster:
  3275. cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3276. - name: read_images_train_16_gpu
  3277. group: data-tests
  3278. working_dir: nightly_tests/dataset
  3279. frequency: nightly
  3280. team: data
  3281. cluster:
  3282. byod:
  3283. type: gpu
  3284. post_build_script: byod_install_mosaicml.sh
  3285. cluster_compute: multi_node_train_16_workers.yaml
  3286. run:
  3287. timeout: 18000
  3288. script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --use-gpu --num-epochs 2
  3289. variations:
  3290. - __suffix__: aws
  3291. - __suffix__: gce
  3292. env: gce
  3293. frequency: manual
  3294. cluster:
  3295. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
  3296. - name: read_images_train_16_gpu_preserve_order
  3297. group: data-tests
  3298. working_dir: nightly_tests/dataset
  3299. frequency: nightly
  3300. team: data
  3301. cluster:
  3302. byod:
  3303. type: gpu
  3304. post_build_script: byod_install_mosaicml.sh
  3305. cluster_compute: multi_node_train_16_workers.yaml
  3306. run:
  3307. timeout: 18000
  3308. script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --preserve-order --use-gpu --num-epochs 2
  3309. variations:
  3310. - __suffix__: aws
  3311. - __suffix__: gce
  3312. env: gce
  3313. frequency: manual
  3314. cluster:
  3315. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
  3316. - name: read_parquet_train_4_gpu
  3317. group: data-tests
  3318. working_dir: nightly_tests/dataset
  3319. frequency: nightly
  3320. team: data
  3321. cluster:
  3322. byod:
  3323. type: gpu
  3324. post_build_script: byod_install_mosaicml.sh
  3325. cluster_compute: multi_node_train_4_workers.yaml
  3326. run:
  3327. timeout: 3600
  3328. script: python multi_node_train_benchmark.py --num-workers 4 --file-type parquet --target-worker-gb 50 --use-gpu
  3329. variations:
  3330. - __suffix__: aws
  3331. - __suffix__: gce
  3332. env: gce
  3333. frequency: manual
  3334. cluster:
  3335. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3336. - name: read_parquet_train_16_gpu
  3337. group: data-tests
  3338. working_dir: nightly_tests/dataset
  3339. frequency: nightly
  3340. team: data
  3341. cluster:
  3342. byod:
  3343. type: gpu
  3344. post_build_script: byod_install_mosaicml.sh
  3345. cluster_compute: multi_node_train_16_workers.yaml
  3346. run:
  3347. timeout: 3600
  3348. script: python multi_node_train_benchmark.py --num-workers 16 --file-type parquet --target-worker-gb 50 --use-gpu
  3349. variations:
  3350. - __suffix__: aws
  3351. - __suffix__: gce
  3352. env: gce
  3353. frequency: manual
  3354. cluster:
  3355. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
  3356. - name: read_images_train_1_gpu_5_cpu
  3357. group: data-tests
  3358. working_dir: nightly_tests/dataset
  3359. frequency: nightly
  3360. team: data
  3361. cluster:
  3362. byod:
  3363. type: gpu
  3364. post_build_script: byod_install_mosaicml.sh
  3365. cluster_compute: multi_node_train_1g5c.yaml
  3366. run:
  3367. timeout: 2400
  3368. script: python multi_node_train_benchmark.py --num-workers 1 --file-type image --use-gpu --num-epochs 2 --skip-train-model --prefetch-batches 16 --batch-size -1 --disable-locality-with-output
  3369. variations:
  3370. - __suffix__: aws
  3371. - __suffix__: gce
  3372. env: gce
  3373. frequency: manual
  3374. cluster:
  3375. cluster_compute: compute_gpu_1g5c_gce.yaml
  3376. - name: read_tfrecords_benchmark_single_node
  3377. group: data-tests
  3378. working_dir: nightly_tests/dataset
  3379. frequency: nightly
  3380. team: data
  3381. cluster:
  3382. byod:
  3383. type: gpu
  3384. post_build_script: byod_install_mosaicml.sh
  3385. cluster_compute: single_node_benchmark_compute.yaml
  3386. run:
  3387. # Expect the benchmark to finish around 22 minutes.
  3388. timeout: 1800
  3389. script: python read_tfrecords_benchmark.py
  3390. variations:
  3391. - __suffix__: aws
  3392. - __suffix__: gce
  3393. env: gce
  3394. frequency: manual
  3395. cluster:
  3396. cluster_compute: single_node_benchmark_compute_gce.yaml
  3397. - name: map_batches_benchmark_single_node
  3398. group: data-tests
  3399. working_dir: nightly_tests/dataset
  3400. frequency: nightly
  3401. team: data
  3402. cluster:
  3403. byod:
  3404. type: gpu
  3405. cluster_compute: single_node_benchmark_compute.yaml
  3406. run:
  3407. # Expect the benchmark to finish around 30 minutes.
  3408. timeout: 2400
  3409. script: python map_batches_benchmark.py
  3410. variations:
  3411. - __suffix__: aws
  3412. - __suffix__: gce
  3413. env: gce
  3414. frequency: manual
  3415. cluster:
  3416. cluster_compute: single_node_benchmark_compute_gce.yaml
  3417. - name: iter_tensor_batches_benchmark_single_node
  3418. group: data-tests
  3419. working_dir: nightly_tests/dataset
  3420. frequency: nightly
  3421. team: data
  3422. cluster:
  3423. byod:
  3424. type: gpu
  3425. cluster_compute: single_node_benchmark_compute.yaml
  3426. run:
  3427. # Expect the benchmark to finish around 30 minutes.
  3428. timeout: 2400
  3429. script: python iter_tensor_batches_benchmark.py
  3430. variations:
  3431. - __suffix__: aws
  3432. - __suffix__: gce
  3433. env: gce
  3434. frequency: manual
  3435. cluster:
  3436. cluster_compute: single_node_benchmark_compute_gce.yaml
  3437. - name: iter_tensor_batches_benchmark_multi_node
  3438. group: data-tests
  3439. working_dir: nightly_tests/dataset
  3440. frequency: nightly
  3441. team: data
  3442. cluster:
  3443. byod:
  3444. type: gpu
  3445. cluster_compute: multi_node_benchmark_compute.yaml
  3446. run:
  3447. # Expect the benchmark to finish within 90 minutes.
  3448. timeout: 5400
  3449. script: python iter_tensor_batches_benchmark.py --data-size-gb=10
  3450. variations:
  3451. - __suffix__: aws
  3452. - __suffix__: gce
  3453. env: gce
  3454. frequency: manual
  3455. cluster:
  3456. cluster_compute: multi_node_benchmark_compute_gce.yaml
  3457. - name: iter_batches_benchmark_single_node
  3458. group: data-tests
  3459. working_dir: nightly_tests/dataset
  3460. frequency: nightly
  3461. team: data
  3462. cluster:
  3463. byod:
  3464. type: gpu
  3465. cluster_compute: single_node_benchmark_compute.yaml
  3466. run:
  3467. # Expect the benchmark to finish around 12 minutes.
  3468. timeout: 1080
  3469. script: python iter_batches_benchmark.py
  3470. variations:
  3471. - __suffix__: aws
  3472. - __suffix__: gce
  3473. env: gce
  3474. frequency: manual
  3475. cluster:
  3476. cluster_compute: single_node_benchmark_compute_gce.yaml
  3477. - name: dataset_shuffle_random_shuffle_1tb
  3478. group: data-tests
  3479. working_dir: nightly_tests
  3480. frequency: nightly
  3481. team: data
  3482. cluster:
  3483. byod:
  3484. runtime_env:
  3485. - RAY_worker_killing_policy=retriable_lifo
  3486. pip:
  3487. - ray[default]
  3488. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3489. run:
  3490. timeout: 7200
  3491. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3492. wait_for_nodes:
  3493. num_nodes: 20
  3494. variations:
  3495. - __suffix__: aws
  3496. - __suffix__: gce
  3497. env: gce
  3498. frequency: manual
  3499. cluster:
  3500. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3501. - name: dataset_shuffle_sort_1tb
  3502. group: data-tests
  3503. working_dir: nightly_tests
  3504. frequency: nightly
  3505. team: data
  3506. stable: false
  3507. cluster:
  3508. byod:
  3509. runtime_env:
  3510. - RAY_worker_killing_policy=retriable_lifo
  3511. pip:
  3512. - ray[default]
  3513. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3514. run:
  3515. timeout: 7200
  3516. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3517. wait_for_nodes:
  3518. num_nodes: 20
  3519. variations:
  3520. - __suffix__: aws
  3521. - __suffix__: gce
  3522. env: gce
  3523. frequency: manual
  3524. cluster:
  3525. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3526. - name: dataset_shuffle_push_based_random_shuffle_1tb
  3527. group: data-tests
  3528. working_dir: nightly_tests
  3529. stable: false
  3530. frequency: nightly
  3531. team: data
  3532. cluster:
  3533. byod:
  3534. runtime_env:
  3535. - RAY_worker_killing_policy=retriable_lifo
  3536. pip:
  3537. - ray[default]
  3538. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3539. run:
  3540. timeout: 7200
  3541. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3542. wait_for_nodes:
  3543. num_nodes: 20
  3544. variations:
  3545. - __suffix__: aws
  3546. - __suffix__: gce
  3547. env: gce
  3548. frequency: manual
  3549. cluster:
  3550. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3551. - name: dataset_shuffle_push_based_sort_1tb
  3552. group: data-tests
  3553. working_dir: nightly_tests
  3554. frequency: nightly
  3555. team: data
  3556. stable: false
  3557. cluster:
  3558. byod:
  3559. runtime_env:
  3560. - RAY_worker_killing_policy=retriable_lifo
  3561. pip:
  3562. - ray[default]
  3563. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3564. run:
  3565. timeout: 7200
  3566. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3567. wait_for_nodes:
  3568. num_nodes: 20
  3569. variations:
  3570. - __suffix__: aws
  3571. - __suffix__: gce
  3572. env: gce
  3573. frequency: manual
  3574. cluster:
  3575. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3576. - name: dataset_shuffle_push_based_random_shuffle_100tb
  3577. group: data-tests
  3578. working_dir: nightly_tests
  3579. stable: false
  3580. frequency: weekly
  3581. team: data
  3582. cluster:
  3583. byod:
  3584. runtime_env:
  3585. - RAY_object_spilling_config={"type":"filesystem","params":{"directory_path":["/tmp/data0","/tmp/data1"]}}
  3586. post_build_script: byod_dataset_shuffle.sh
  3587. cluster_compute: shuffle/100tb_shuffle_compute.yaml
  3588. run:
  3589. timeout: 28800
  3590. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=100000 --partition-size=1e9 --shuffle
  3591. wait_for_nodes:
  3592. num_nodes: 100
  3593. variations:
  3594. - __suffix__: aws
  3595. - __suffix__: gce
  3596. env: gce
  3597. frequency: manual
  3598. cluster:
  3599. cluster_compute: shuffle/100tb_shuffle_compute_gce.yaml
  3600. run:
  3601. timeout: 28800
  3602. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=40000 --partition-size=1e9 --shuffle
  3603. wait_for_nodes:
  3604. num_nodes: 100
  3605. ##################
  3606. # Core Chaos tests
  3607. ##################
  3608. - name: chaos_many_tasks_kill_raylet
  3609. group: core-nightly-test
  3610. working_dir: nightly_tests
  3611. frequency: nightly
  3612. team: core
  3613. cluster:
  3614. byod: {}
  3615. cluster_compute: chaos_test/compute_template.yaml
  3616. run:
  3617. timeout: 3600
  3618. wait_for_nodes:
  3619. num_nodes: 10
  3620. prepare: python setup_chaos.py --no-start
  3621. script: python chaos_test/test_chaos_basic.py --workload=tasks
  3622. variations:
  3623. - __suffix__: aws
  3624. - __suffix__: gce
  3625. env: gce
  3626. frequency: manual
  3627. cluster:
  3628. cluster_compute: chaos_test/compute_template_gce.yaml
  3629. - name: chaos_many_tasks_terminate_instance
  3630. group: core-nightly-test
  3631. working_dir: nightly_tests
  3632. frequency: nightly
  3633. team: core
  3634. cluster:
  3635. byod: {}
  3636. cluster_compute: chaos_test/compute_template.yaml
  3637. run:
  3638. timeout: 3600
  3639. wait_for_nodes:
  3640. num_nodes: 10
  3641. prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance
  3642. script: python chaos_test/test_chaos_basic.py --workload=tasks
  3643. variations:
  3644. - __suffix__: aws
  3645. - name: chaos_many_actors_kill_raylet
  3646. group: core-nightly-test
  3647. working_dir: nightly_tests
  3648. frequency: nightly
  3649. team: core
  3650. cluster:
  3651. byod: {}
  3652. cluster_compute: chaos_test/compute_template.yaml
  3653. run:
  3654. timeout: 4200
  3655. wait_for_nodes:
  3656. num_nodes: 10
  3657. prepare: python setup_chaos.py --no-start
  3658. script: python chaos_test/test_chaos_basic.py --workload=actors
  3659. variations:
  3660. - __suffix__: aws
  3661. - __suffix__: gce
  3662. env: gce
  3663. frequency: manual
  3664. cluster:
  3665. cluster_compute: chaos_test/compute_template_gce.yaml
  3666. - name: chaos_many_actors_terminate_instance
  3667. group: core-nightly-test
  3668. working_dir: nightly_tests
  3669. frequency: nightly
  3670. team: core
  3671. cluster:
  3672. byod: {}
  3673. cluster_compute: chaos_test/compute_template.yaml
  3674. run:
  3675. timeout: 4200
  3676. wait_for_nodes:
  3677. num_nodes: 10
  3678. prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance
  3679. script: python chaos_test/test_chaos_basic.py --workload=actors
  3680. variations:
  3681. - __suffix__: aws
  3682. - name: chaos_dask_on_ray_large_scale_test_no_spilling
  3683. group: data-tests
  3684. working_dir: nightly_tests
  3685. frequency: nightly
  3686. team: data
  3687. cluster:
  3688. byod:
  3689. runtime_env:
  3690. - RAY_lineage_pinning_enabled=1
  3691. cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml
  3692. run:
  3693. timeout: 7200
  3694. wait_for_nodes:
  3695. num_nodes: 21
  3696. prepare: python setup_chaos.py --kill-interval 100
  3697. script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb
  3698. 20 --error_rate 0 --data_save_path /tmp/ray
  3699. variations:
  3700. - __suffix__: aws
  3701. - __suffix__: gce
  3702. env: gce
  3703. frequency: manual
  3704. cluster:
  3705. cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  3706. - name: chaos_dask_on_ray_large_scale_test_spilling
  3707. group: data-tests
  3708. working_dir: nightly_tests
  3709. frequency: nightly
  3710. team: data
  3711. cluster:
  3712. byod:
  3713. runtime_env:
  3714. - RAY_lineage_pinning_enabled=1
  3715. cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
  3716. run:
  3717. timeout: 7200
  3718. wait_for_nodes:
  3719. num_nodes: 21
  3720. prepare: python setup_chaos.py --kill-interval 100
  3721. script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
  3722. 70 --error_rate 0 --data_save_path /tmp/ray
  3723. variations:
  3724. - __suffix__: aws
  3725. - __suffix__: gce
  3726. env: gce
  3727. frequency: manual
  3728. cluster:
  3729. cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  3730. - name: chaos_dataset_shuffle_push_based_sort_1tb
  3731. group: data-tests
  3732. working_dir: nightly_tests
  3733. stable: false
  3734. frequency: nightly
  3735. team: data
  3736. cluster:
  3737. byod:
  3738. runtime_env:
  3739. - RAY_worker_killing_policy=retriable_lifo
  3740. pip:
  3741. - ray[default]
  3742. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3743. run:
  3744. timeout: 7200
  3745. prepare: 'python setup_chaos.py --kill-interval 1200 --max-to-kill 3'
  3746. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3747. wait_for_nodes:
  3748. num_nodes: 20
  3749. variations:
  3750. - __suffix__: aws
  3751. - __suffix__: gce
  3752. env: gce
  3753. frequency: manual
  3754. cluster:
  3755. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3756. - name: chaos_dataset_shuffle_sort_1tb
  3757. group: data-tests
  3758. working_dir: nightly_tests
  3759. stable: false
  3760. frequency: nightly
  3761. team: data
  3762. cluster:
  3763. byod:
  3764. runtime_env:
  3765. - RAY_memory_monitor_refresh_ms=0
  3766. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3767. run:
  3768. timeout: 7200
  3769. prepare: 'python setup_chaos.py --kill-interval 900 --max-to-kill 3'
  3770. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3771. wait_for_nodes:
  3772. num_nodes: 20
  3773. variations:
  3774. - __suffix__: aws
  3775. - __suffix__: gce
  3776. env: gce
  3777. frequency: manual
  3778. cluster:
  3779. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3780. - name: chaos_dataset_shuffle_random_shuffle_1tb
  3781. group: data-tests
  3782. working_dir: nightly_tests
  3783. stable: false
  3784. frequency: nightly
  3785. team: data
  3786. cluster:
  3787. # Leave OOM killing disabled (RAY_memory_monitor_refresh_ms=0) while this test is marked unstable.
  3788. byod:
  3789. runtime_env:
  3790. - RAY_memory_monitor_refresh_ms=0
  3791. pip:
  3792. - ray[default]
  3793. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3794. run:
  3795. timeout: 7200
  3796. prepare: 'python setup_chaos.py --kill-interval 600 --max-to-kill 2'
  3797. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3798. wait_for_nodes:
  3799. num_nodes: 20
  3800. variations:
  3801. - __suffix__: aws
  3802. - __suffix__: gce
  3803. env: gce
  3804. frequency: manual
  3805. cluster:
  3806. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3807. - name: chaos_dataset_shuffle_push_based_random_shuffle_1tb
  3808. group: data-tests
  3809. working_dir: nightly_tests
  3810. stable: false
  3811. frequency: nightly
  3812. team: data
  3813. cluster:
  3814. # Leave OOM killing disabled (RAY_memory_monitor_refresh_ms=0) while this test is marked unstable.
  3815. byod:
  3816. runtime_env:
  3817. - RAY_memory_monitor_refresh_ms=0
  3818. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3819. run:
  3820. timeout: 7200
  3821. prepare: 'python setup_chaos.py --kill-interval 600 --max-to-kill 2'
  3822. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3823. wait_for_nodes:
  3824. num_nodes: 20
  3825. variations:
  3826. - __suffix__: aws
  3827. - __suffix__: gce
  3828. env: gce
  3829. frequency: manual
  3830. cluster:
  3831. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3832. #####################
  3833. # Observability tests
  3834. #####################
  3835. - name: agent_stress_test
  3836. group: core-observability-test
  3837. working_dir: dashboard
  3838. frequency: nightly
  3839. team: core
  3840. cluster:
  3841. byod:
  3842. type: gpu
  3843. runtime_env:
  3844. - RAY_INTERNAL_MEM_PROFILE_COMPONENTS=dashboard_agent
  3845. post_build_script: byod_agent_stress_test.sh
  3846. cluster_compute: agent_stress_compute.yaml
  3847. run:
  3848. timeout: 14400
  3849. script: python mem_check.py --working-dir .
  3850. variations:
  3851. - __suffix__: aws
  3852. - __suffix__: gce
  3853. env: gce
  3854. frequency: manual
  3855. cluster:
  3856. cluster_compute: agent_stress_compute_gce.yaml
  3857. - name: k8s_serve_ha_test
  3858. group: k8s-test
  3859. working_dir: k8s_tests
  3860. stable: false
  3861. frequency: nightly
  3862. team: serve
  3863. cluster:
  3864. byod: {}
  3865. cluster_compute: compute_tpl.yaml
  3866. run:
  3867. timeout: 28800 # 8h
  3868. prepare: bash prepare.sh
  3869. script: python run_gcs_ft_on_k8s.py
  3870. - name: aws_cluster_launcher
  3871. group: cluster-launcher-test
  3872. working_dir: ../python/ray/autoscaler/
  3873. frequency: nightly
  3874. team: clusters
  3875. cluster:
  3876. byod: {}
  3877. cluster_compute: aws/tests/aws_compute.yaml
  3878. run:
  3879. timeout: 2400
  3880. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10
  3881. - name: aws_cluster_launcher_nightly_image
  3882. group: cluster-launcher-test
  3883. working_dir: ../python/ray/autoscaler/
  3884. frequency: nightly
  3885. team: clusters
  3886. cluster:
  3887. byod: {}
  3888. cluster_compute: aws/tests/aws_compute.yaml
  3889. run:
  3890. timeout: 2400
  3891. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override nightly
  3892. - name: aws_cluster_launcher_latest_image
  3893. group: cluster-launcher-test
  3894. working_dir: ../python/ray/autoscaler/
  3895. frequency: nightly
  3896. team: clusters
  3897. cluster:
  3898. byod: {}
  3899. cluster_compute: aws/tests/aws_compute.yaml
  3900. run:
  3901. timeout: 2400
  3902. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override latest
  3903. - name: aws_cluster_launcher_release_image
  3904. group: cluster-launcher-test
  3905. working_dir: ../python/ray/autoscaler/
  3906. frequency: manual
  3907. team: clusters
  3908. cluster:
  3909. byod: {}
  3910. cluster_compute: aws/tests/aws_compute.yaml
  3911. run:
  3912. timeout: 2400
  3913. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override commit
  3914. - name: aws_cluster_launcher_minimal
  3915. group: cluster-launcher-test
  3916. working_dir: ../python/ray/autoscaler/
  3917. frequency: nightly
  3918. team: clusters
  3919. cluster:
  3920. byod: {}
  3921. cluster_compute: aws/tests/aws_compute.yaml
  3922. run:
  3923. timeout: 1200
  3924. script: python launch_and_verify_cluster.py aws/example-minimal.yaml
  3925. - name: aws_cluster_launcher_full
  3926. group: cluster-launcher-test
  3927. working_dir: ../python/ray/autoscaler/
  3928. frequency: nightly
  3929. team: clusters
  3930. cluster:
  3931. byod: {}
  3932. cluster_compute: aws/tests/aws_compute.yaml
  3933. run:
  3934. timeout: 3000
  3935. script: python launch_and_verify_cluster.py aws/example-full.yaml --num-expected-nodes 2 --retries 20
  3936. - name: gcp_cluster_launcher_minimal
  3937. group: cluster-launcher-test
  3938. working_dir: ../python/ray/autoscaler/
  3939. stable: true
  3940. env: gce
  3941. frequency: nightly
  3942. team: clusters
  3943. cluster:
  3944. byod: {}
  3945. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3946. run:
  3947. timeout: 1200
  3948. script: python launch_and_verify_cluster.py gcp/example-minimal.yaml
  3949. - name: gcp_cluster_launcher_full
  3950. group: cluster-launcher-test
  3951. working_dir: ../python/ray/autoscaler/
  3952. stable: true
  3953. env: gce
  3954. frequency: nightly
  3955. team: clusters
  3956. cluster:
  3957. byod: {}
  3958. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3959. run:
  3960. timeout: 4800
  3961. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 30
  3962. - name: gcp_cluster_launcher_latest_image
  3963. group: cluster-launcher-test
  3964. working_dir: ../python/ray/autoscaler/
  3965. stable: true
  3966. env: gce
  3967. frequency: nightly
  3968. team: clusters
  3969. cluster:
  3970. byod: {}
  3971. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3972. run:
  3973. timeout: 3600
  3974. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest
  3975. - name: gcp_cluster_launcher_nightly_image
  3976. group: cluster-launcher-test
  3977. working_dir: ../python/ray/autoscaler/
  3978. stable: true
  3979. env: gce
  3980. frequency: nightly
  3981. team: clusters
  3982. cluster:
  3983. byod: {}
  3984. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3985. run:
  3986. timeout: 3600
  3987. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly
  3988. - name: gcp_cluster_launcher_release_image
  3989. group: cluster-launcher-test
  3990. working_dir: ../python/ray/autoscaler/
  3991. stable: true
  3992. env: gce
  3993. frequency: manual
  3994. team: clusters
  3995. cluster:
  3996. byod: {}
  3997. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3998. run:
  3999. timeout: 3600
  4000. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override commit
  4001. - name: gcp_cluster_launcher_gpu_docker
  4002. group: cluster-launcher-test
  4003. working_dir: ../python/ray/autoscaler/
  4004. stable: true
  4005. env: gce
  4006. frequency: weekly
  4007. team: clusters
  4008. cluster:
  4009. byod: {}
  4010. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  4011. run:
  4012. timeout: 1200
  4013. script: python launch_and_verify_cluster.py gcp/example-gpu-docker.yaml
  4014. - name: autoscaler_aws
  4015. group: autoscaler-test
  4016. working_dir: autoscaling_tests
  4017. stable: false
  4018. frequency: nightly
  4019. team: core
  4020. cluster:
  4021. # Leave OOM killing disabled (RAY_memory_monitor_refresh_ms=0) while this test is marked unstable.
  4022. byod:
  4023. runtime_env:
  4024. - RAY_memory_monitor_refresh_ms=0
  4025. pip:
  4026. - ray[default]
  4027. cluster_compute: aws.yaml
  4028. run:
  4029. timeout: 1800
  4030. script: python run.py