release_tests.yaml 111 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064
  1. # Global release test configuration file.
  2. # All your release test configuration should go here. Adding release tests here
  3. # will automatically enable them in the Buildkite release testing schedules
  4. # (unless they have frequency: manual).
  5. # Here is an example configuration for reference:
  6. #- name: example_test
  7. # # Tests with the same group will be grouped in the Buildkite UI
  8. # group: Example group
  9. # # Provide the working directory which will be uploaded to the cluster
  10. # working_dir: example_dir
  11. #
  12. # # How often to run the tests.
  13. # # One of [manual, any, multi, nightly, nightly-3x, weekly].
  14. # # Descriptions of each frequency (that's not immediately obvious):
  15. # # - manual: Not run on a schedule, but can be manually run through the buildkite UI.
  16. # # - nightly-3x: Run 3 times a week (Monday, Wednesday, Friday).
  17. # frequency: weekly
  18. # # Owning team. This field will be persisted to the database
  19. # team: ml
  20. #
  21. # # Python version. This optional field determines which Python version to run tests
  22. # # on. This must be a string!
  23. # python: "3.7"
  24. #
  25. # # Cluster information
  26. # cluster:
  27. # # Location of cluster compute, relative to working_dir
  28. # cluster_compute: cluster_compute.yaml
  29. # # Autosuspend parameter passed to the cluster.
  30. # # The cluster will automatically terminate if inactive for this
  31. # # many minutes. Defaults to 10 if not set.
  32. # autosuspend_mins: 10
  33. # # Optional cloud_id to use instead of the default cloud
  34. # cloud_id: cld_12345678
  35. # # Alternatively, you can specify a cloud name
  36. # cloud_name: anyscale_default_cloud
  37. #
  38. # # Run configuration for the test
  39. # run:
  40. # # If you want to wait for nodes to be ready, you can specify this here:
  41. # wait_for_nodes:
  42. # # Number of nodes
  43. # num_nodes: 16
  44. # # Timeout for waiting for nodes. If nodes are not up by then, the
  45. # # test will fail.
  46. # timeout: 600
  47. #
  48. # # Optional prepare script to be run on the cluster before the test script
  49. # prepare: python prepare.py
  50. # # The prepare command can have a separate timeout
  51. # prepare_timeout: 300
  52. #
  53. # # Main script to run as the test script
  54. # script: python workloads/train_small.py
  55. # # Timeout in seconds. After this time the test is considered as failed.
  56. # timeout: 600
  57. #
  58. # # You can specify smoke test definitions here. If a smoke test is triggered,
  59. # # it will deep update the main test configuration with the values provided
  60. # # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
  61. # # environment variable and receive the --smoke-test flag as a parameter in the
  62. # # run script.
  63. # smoke_test:
  64. # # Smoke tests can have different frequencies. A smoke test is only triggered
  65. # # when the regular test is not matched.
  66. # frequency: nightly
  67. # # Here we adjust the run timeout down and run on fewer nodes. The test script
  68. # # remains the same.
  69. # run:
  70. # timeout: 300
  71. # wait_for_nodes:
  72. # num_nodes: 4
  73. # timeout: 600
  74. #
  75. # # After the test has finished, this handler (in alerts/) will process the results.
  76. # # It can then let the test fail, e.g. if a metric regression is observed.
  77. # alert: default
  78. #######################
  79. # Cluster scaling tests
  80. #######################
  81. - name: cluster_tune_scale_up_down
  82. group: Cluster tests
  83. working_dir: cluster_tests
  84. frequency: nightly
  85. team: ml
  86. cluster:
  87. byod: {}
  88. cluster_compute: cpt_autoscaling_1-3_aws.yaml
  89. run:
  90. timeout: 3600
  91. script: python workloads/tune_scale_up_down.py
  92. wait_for_nodes:
  93. num_nodes: 0
  94. variations:
  95. - __suffix__: aws
  96. - __suffix__: gce
  97. env: gce
  98. frequency: manual
  99. cluster:
  100. cluster_compute: cpt_autoscaling_1-3_gce.yaml
  101. alert: default
  102. ############################
  103. # Batch Inference Benchmarks
  104. ############################
  105. # 10 GB image classification raw images with 1 GPU.
  106. # 1 g4dn.4xlarge
  107. - name: torch_batch_inference_1_gpu_10gb_raw
  108. group: data-tests
  109. working_dir: nightly_tests/dataset
  110. frequency: nightly
  111. team: data
  112. cluster:
  113. byod:
  114. type: gpu
  115. cluster_compute: compute_gpu_1_cpu_16_aws.yaml
  116. run:
  117. timeout: 500
  118. script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw
  119. alert: default
  120. variations:
  121. - __suffix__: aws
  122. - __suffix__: gce
  123. env: gce
  124. frequency: manual
  125. cluster:
  126. cluster_compute: compute_gpu_1_cpu_16_gce.yaml
  127. # 10 GB image classification parquet with 1 GPU.
  128. # 1 g4dn.4xlarge
  129. - name: torch_batch_inference_1_gpu_10gb_parquet
  130. group: data-tests
  131. working_dir: nightly_tests/dataset
  132. frequency: nightly
  133. team: data
  134. cluster:
  135. byod:
  136. type: gpu
  137. cluster_compute: compute_gpu_1_cpu_16_aws.yaml
  138. run:
  139. timeout: 500
  140. script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet
  141. alert: default
  142. variations:
  143. - __suffix__: aws
  144. - __suffix__: gce
  145. env: gce
  146. frequency: manual
  147. cluster:
  148. cluster_compute: compute_gpu_1_cpu_16_gce.yaml
  149. # 300 GB image classification raw images with 16 GPUs
  150. # 4 g4dn.12xlarge
  151. - name: torch_batch_inference_16_gpu_300gb_raw
  152. group: data-tests
  153. working_dir: nightly_tests/dataset
  154. frequency: nightly
  155. team: data
  156. cluster:
  157. byod:
  158. type: gpu
  159. cluster_compute: compute_gpu_4x4_aws.yaml
  160. run:
  161. timeout: 1000
  162. script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
  163. wait_for_nodes:
  164. num_nodes: 4
  165. alert: default
  166. variations:
  167. - __suffix__: aws
  168. - __suffix__: gce
  169. env: gce
  170. frequency: manual
  171. cluster:
  172. cluster_compute: compute_gpu_4x4_gce.yaml
  173. - name: chaos_torch_batch_inference_16_gpu_300gb_raw
  174. group: data-tests
  175. working_dir: nightly_tests
  176. stable: false
  177. frequency: nightly
  178. team: data
  179. cluster:
  180. byod:
  181. type: gpu
  182. cluster_compute: dataset/compute_gpu_4x4_aws.yaml
  183. run:
  184. timeout: 1000
  185. prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
  186. script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
  187. wait_for_nodes:
  188. num_nodes: 4
  189. alert: default
  190. variations:
  191. - __suffix__: aws
  192. - __suffix__: gce
  193. env: gce
  194. frequency: manual
  195. cluster:
  196. cluster_compute: dataset/compute_gpu_4x4_gce.yaml
  197. # 300 GB image classification parquet data with 16 GPUs
  198. # 4 g4dn.12xlarge
  199. - name: torch_batch_inference_16_gpu_300gb_parquet
  200. group: data-tests
  201. working_dir: nightly_tests/dataset
  202. frequency: nightly
  203. team: data
  204. cluster:
  205. byod:
  206. type: gpu
  207. cluster_compute: compute_gpu_4x4_aws.yaml
  208. run:
  209. timeout: 1000
  210. script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
  211. wait_for_nodes:
  212. num_nodes: 4
  213. alert: default
  214. variations:
  215. - __suffix__: aws
  216. - __suffix__: gce
  217. env: gce
  218. frequency: manual
  219. cluster:
  220. cluster_compute: compute_gpu_4x4_gce.yaml
  221. # 10 TB image classification parquet data with a heterogeneous cluster
  222. # 10 g4dn.12xlarge, 10 m5.16xlarge
  223. - name: torch_batch_inference_hetero_10tb_parquet
  224. group: data-tests
  225. working_dir: nightly_tests/dataset
  226. frequency: weekly
  227. team: data
  228. cluster:
  229. byod:
  230. type: gpu
  231. cluster_compute: compute_hetero_10x10_aws.yaml
  232. run:
  233. timeout: 2000
  234. script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
  235. wait_for_nodes:
  236. num_nodes: 20
  237. alert: default
  238. #########################
  239. # AIR release tests
  240. #########################
  241. - name: tune_with_frequent_pausing
  242. group: AIR tests
  243. working_dir: air_tests
  244. frequency: nightly-3x
  245. team: ml
  246. cluster:
  247. byod:
  248. runtime_env:
  249. - RAY_memory_usage_threshold=0.5
  250. - automatic_object_spilling_enabled=0
  251. cluster_compute: frequent_pausing/compute_config_aws.yaml
  252. run:
  253. timeout: 600 # 10min
  254. long_running: true
  255. script: python frequent_pausing/script.py
  256. variations:
  257. - __suffix__: aws
  258. - __suffix__: gce
  259. env: gce
  260. frequency: manual
  261. cluster:
  262. cluster_compute: frequent_pausing/compute_config_gce.yaml
  263. alert: default
  264. - name: long_running_horovod_tune_test
  265. group: AIR tests
  266. working_dir: air_tests
  267. frequency: weekly
  268. team: ml
  269. cluster:
  270. byod:
  271. type: gpu
  272. post_build_script: byod_horovod_master_test.sh
  273. cluster_compute: horovod/compute_tpl_aws.yaml
  274. variations:
  275. - __suffix__: aws
  276. - __suffix__: gce
  277. env: gce
  278. frequency: manual
  279. cluster:
  280. cluster_compute: horovod/compute_tpl_gce.yaml
  281. run:
  282. timeout: 36000
  283. script: python horovod/workloads/horovod_tune_test.py
  284. long_running: true
  285. wait_for_nodes:
  286. num_nodes: 2
  287. smoke_test:
  288. frequency: manual
  289. run:
  290. timeout: 3600
  291. alert: default
  292. # Ray AIR distributed Torch benchmarks
  293. - name: air_benchmark_torch_mnist_cpu_4x1
  294. group: AIR tests
  295. working_dir: air_tests/air_benchmarks
  296. frequency: nightly
  297. team: ml
  298. cluster:
  299. byod:
  300. type: gpu
  301. cluster_compute: compute_cpu_4_aws.yaml
  302. run:
  303. timeout: 3600
  304. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
  305. wait_for_nodes:
  306. num_nodes: 4
  307. variations:
  308. - __suffix__: aws
  309. - __suffix__: gce
  310. env: gce
  311. frequency: manual
  312. cluster:
  313. cluster_compute: compute_cpu_4_gce.yaml
  314. alert: default
  315. - name: air_benchmark_torch_mnist_gpu_4x4
  316. group: AIR tests
  317. working_dir: air_tests/air_benchmarks
  318. frequency: weekly
  319. team: ml
  320. cluster:
  321. byod:
  322. type: gpu
  323. cluster_compute: compute_gpu_4x4_aws.yaml
  324. run:
  325. timeout: 4800
  326. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 120 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
  327. wait_for_nodes:
  328. num_nodes: 4
  329. smoke_test:
  330. frequency: nightly
  331. cluster:
  332. cluster_compute: compute_gpu_2x2_aws.yaml
  333. run:
  334. timeout: 3600
  335. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
  336. wait_for_nodes:
  337. num_nodes: 2
  338. variations:
  339. - __suffix__: aws
  340. - __suffix__: gce
  341. env: gce
  342. frequency: manual
  343. cluster:
  344. cluster_compute: compute_gpu_4x4_gce.yaml
  345. smoke_test:
  346. frequency: manual
  347. alert: default
  348. - name: air_benchmark_torch_mnist_cpu_1x4
  349. group: AIR tests
  350. working_dir: air_tests/air_benchmarks
  351. frequency: nightly
  352. team: ml
  353. cluster:
  354. byod:
  355. type: gpu
  356. cluster_compute: compute_cpu_1_aws.yaml
  357. run:
  358. timeout: 3600
  359. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
  360. variations:
  361. - __suffix__: aws
  362. - __suffix__: gce
  363. env: gce
  364. frequency: manual
  365. cluster:
  366. cluster_compute: compute_cpu_1_gce.yaml
  367. alert: default
  368. - name: air_benchmark_torch_mnist_cpu_4x4
  369. group: AIR tests
  370. working_dir: air_tests/air_benchmarks
  371. frequency: nightly
  372. team: ml
  373. cluster:
  374. byod:
  375. type: gpu
  376. cluster_compute: compute_cpu_4_aws.yaml
  377. run:
  378. timeout: 5400
  379. script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
  380. wait_for_nodes:
  381. num_nodes: 4
  382. variations:
  383. - __suffix__: aws
  384. - __suffix__: gce
  385. env: gce
  386. frequency: manual
  387. cluster:
  388. cluster_compute: compute_cpu_4_gce.yaml
  389. alert: default
  390. - name: air_benchmark_tune_torch_mnist
  391. group: AIR tests
  392. working_dir: air_tests/air_benchmarks
  393. frequency: nightly
  394. team: ml
  395. cluster:
  396. byod:
  397. type: gpu
  398. cluster_compute: compute_cpu_8_aws.yaml
  399. run:
  400. timeout: 3600
  401. script: python workloads/tune_torch_benchmark.py --num-runs 3 --num-trials 8 --num-workers 4
  402. wait_for_nodes:
  403. num_nodes: 8
  404. variations:
  405. - __suffix__: aws
  406. - __suffix__: gce
  407. env: gce
  408. frequency: manual
  409. cluster:
  410. cluster_compute: compute_cpu_8_gce.yaml
  411. alert: default
  412. - name: air_benchmark_tune_torch_mnist_gpu
  413. group: AIR tests
  414. working_dir: air_tests/air_benchmarks
  415. frequency: nightly
  416. team: ml
  417. cluster:
  418. byod:
  419. type: gpu
  420. cluster_compute: compute_gpu_4x4_aws.yaml
  421. run:
  422. timeout: 3600
  423. script: python workloads/tune_torch_benchmark.py --num-runs 2 --num-trials 4 --num-workers 4 --use-gpu
  424. wait_for_nodes:
  425. num_nodes: 4
  426. variations:
  427. - __suffix__: aws
  428. - __suffix__: gce
  429. env: gce
  430. frequency: manual
  431. cluster:
  432. cluster_compute: compute_gpu_4x4_gce.yaml
  433. alert: default
  434. # Ray AIR distributed Tensorflow benchmarks
  435. - name: air_benchmark_tensorflow_mnist_cpu_4x1
  436. group: AIR tests
  437. working_dir: air_tests/air_benchmarks
  438. frequency: nightly
  439. team: ml
  440. cluster:
  441. byod:
  442. type: gpu
  443. cluster_compute: compute_cpu_4_aws.yaml
  444. run:
  445. timeout: 5400
  446. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
  447. wait_for_nodes:
  448. num_nodes: 4
  449. variations:
  450. - __suffix__: aws
  451. - __suffix__: gce
  452. env: gce
  453. frequency: manual
  454. cluster:
  455. cluster_compute: compute_cpu_4_gce.yaml
  456. alert: default
  457. - name: air_benchmark_tensorflow_mnist_cpu_1x4
  458. group: AIR tests
  459. working_dir: air_tests/air_benchmarks
  460. frequency: nightly
  461. team: ml
  462. cluster:
  463. byod:
  464. type: gpu
  465. cluster_compute: compute_cpu_1_aws.yaml
  466. run:
  467. timeout: 5400
  468. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
  469. variations:
  470. - __suffix__: aws
  471. - __suffix__: gce
  472. env: gce
  473. frequency: manual
  474. cluster:
  475. cluster_compute: compute_cpu_1_gce.yaml
  476. alert: default
  477. - name: air_benchmark_tensorflow_mnist_cpu_4x4
  478. group: AIR tests
  479. working_dir: air_tests/air_benchmarks
  480. frequency: nightly
  481. team: ml
  482. stable: false
  483. cluster:
  484. byod:
  485. type: gpu
  486. cluster_compute: compute_cpu_4_aws.yaml
  487. run:
  488. timeout: 5400
  489. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
  490. wait_for_nodes:
  491. num_nodes: 4
  492. variations:
  493. - __suffix__: aws
  494. - __suffix__: gce
  495. env: gce
  496. frequency: manual
  497. cluster:
  498. cluster_compute: compute_cpu_4_gce.yaml
  499. alert: default
  500. - name: air_benchmark_tensorflow_mnist_gpu_4x4
  501. group: AIR tests
  502. working_dir: air_tests/air_benchmarks
  503. frequency: weekly
  504. team: ml
  505. stable: false
  506. cluster:
  507. byod:
  508. type: gpu
  509. cluster_compute: compute_gpu_4x4_aws.yaml
  510. run:
  511. timeout: 5400
  512. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
  513. wait_for_nodes:
  514. num_nodes: 4
  515. smoke_test:
  516. frequency: nightly
  517. cluster:
  518. cluster_compute: compute_gpu_2x2_aws.yaml
  519. run:
  520. script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
  521. wait_for_nodes:
  522. num_nodes: 2
  523. variations:
  524. - __suffix__: aws
  525. - __suffix__: gce
  526. env: gce
  527. frequency: manual
  528. cluster:
  529. cluster_compute: compute_gpu_4x4_gce.yaml
  530. smoke_test:
  531. frequency: manual
  532. alert: default
  533. - name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb
  534. group: AIR tests
  535. working_dir: air_tests/air_benchmarks
  536. frequency: nightly
  537. team: ml
  538. cluster:
  539. byod:
  540. type: gpu
  541. cluster_compute: compute_gpu_1_aws.yaml
  542. run:
  543. timeout: 3600
  544. script: python workloads/pytorch_training_e2e.py --data-size-gb 20
  545. alert: default
  546. variations:
  547. - __suffix__: aws
  548. - __suffix__: gce
  549. env: gce
  550. frequency: manual
  551. cluster:
  552. cluster_compute: compute_gpu_1_gce.yaml
  553. - name: air_benchmark_pytorch_training_e2e_gpu_4x4_100gb
  554. group: AIR tests
  555. working_dir: air_tests/air_benchmarks
  556. frequency: nightly
  557. team: ml
  558. stable: false
  559. cluster:
  560. byod:
  561. type: gpu
  562. cluster_compute: compute_gpu_4x4_aws.yaml
  563. run:
  564. timeout: 10800
  565. script: python workloads/pytorch_training_e2e.py --data-size-gb=100 --num-workers=16
  566. wait_for_nodes:
  567. num_nodes: 4
  568. alert: default
  569. variations:
  570. - __suffix__: aws
  571. - __suffix__: gce
  572. env: gce
  573. frequency: manual
  574. cluster:
  575. cluster_compute: compute_gpu_4x4_gce.yaml
  576. # Test tiny and medium input files to check that performance stays about
  577. # constant.
  578. - name: ray-data-resnet50-ingest-file-size-benchmark
  579. group: AIR tests
  580. working_dir: air_tests/air_benchmarks/mlperf-train
  581. frequency: nightly
  582. team: data
  583. cluster:
  584. byod:
  585. type: gpu
  586. runtime_env:
  587. - RAY_task_oom_retries=50
  588. - RAY_min_memory_free_bytes=1000000000
  589. cluster_compute: compute_cpu_16.yaml
  590. run:
  591. timeout: 3600
  592. script: bash file_size_benchmark.sh
  593. variations:
  594. - __suffix__: aws
  595. - __suffix__: gce
  596. env: gce
  597. frequency: manual
  598. cluster:
  599. cluster_compute: compute_gce_cpu_16.yaml
  600. # Test huge files to check that we do not OOM.
  601. - name: ray-data-resnet50-ingest-out-of-memory-benchmark
  602. group: AIR tests
  603. working_dir: air_tests/air_benchmarks/mlperf-train
  604. stable: false
  605. frequency: nightly
  606. team: data
  607. cluster:
  608. byod:
  609. type: gpu
  610. runtime_env:
  611. - RAY_task_oom_retries=50
  612. - RAY_min_memory_free_bytes=1000000000
  613. cluster_compute: compute_cpu_16.yaml
  614. run:
  615. timeout: 3600
  616. script: bash oom_benchmark.sh
  617. variations:
  618. - __suffix__: aws
  619. - __suffix__: gce
  620. env: gce
  621. frequency: manual
  622. cluster:
  623. cluster_compute: compute_gce_cpu_16.yaml
  624. #######################
  625. # AIR examples
  626. #######################
  627. # Test additional CPU nodes for preprocessing.
  628. - name: air_example_dreambooth_finetuning
  629. group: AIR examples
  630. working_dir: air_examples/dreambooth
  631. stable: false
  632. frequency: weekly
  633. team: ml
  634. cluster:
  635. byod:
  636. type: gpu
  637. cluster_compute: dreambooth_compute_aws.yaml
  638. run:
  639. timeout: 1800
  640. script: pip install -Ur dreambooth/requirements.txt && bash dreambooth_run.sh
  641. artifact_path: /tmp/artifacts/example_out.jpg
  642. # variations: A10G is not yet available on GCE.
  643. - name: air_example_dreambooth_finetuning_lora
  644. group: AIR examples
  645. working_dir: air_examples/dreambooth
  646. stable: false
  647. frequency: weekly
  648. team: ml
  649. cluster:
  650. byod:
  651. type: gpu
  652. cluster_compute: dreambooth_compute_aws.yaml
  653. run:
  654. timeout: 1800
  655. script: pip install -Ur dreambooth/requirements.txt && bash dreambooth_run.sh --lora
  656. artifact_path: /tmp/artifacts/example_out.jpg
  657. - name: air_example_gptj_deepspeed_fine_tuning
  658. group: AIR examples
  659. working_dir: air_examples/gptj_deepspeed_finetuning
  660. frequency: weekly
  661. team: ml
  662. cluster:
  663. byod:
  664. type: gpu
  665. post_build_script: byod_gptj_test.sh
  666. cluster_compute: gptj_deepspeed_compute_aws.yaml
  667. run:
  668. timeout: 4500
  669. script: python test_myst_doc.py --path gptj_deepspeed_fine_tuning.ipynb
  670. - name: air_example_dolly_v2_lightning_fsdp_finetuning
  671. group: AIR examples
  672. working_dir: air_examples/dolly_v2_lightning_fsdp_finetuning
  673. frequency: weekly
  674. team: ml
  675. cluster:
  676. byod:
  677. type: gpu
  678. post_build_script: byod_dolly_test.sh
  679. cluster_compute: dolly_v2_fsdp_compute_aws.yaml
  680. run:
  681. timeout: 4700
  682. script: python test_myst_doc.py --path lightning-llm-finetuning-7b.ipynb
  683. # variations: TODO(jungong): add GCP variation.
  684. - name: air_example_vicuna_13b_lightning_deepspeed_finetuning
  685. group: AIR examples
  686. working_dir: air_examples/vicuna_13b_lightning_deepspeed_finetuning
  687. frequency: weekly
  688. team: ml
  689. cluster:
  690. byod:
  691. type: gpu
  692. post_build_script: byod_vicuna_test.sh
  693. cluster_compute: vicuna_13b_deepspeed_compute_aws.yaml
  694. run:
  695. timeout: 4700
  696. script: python test_myst_doc.py --path vicuna_13b_lightning_deepspeed_finetune.ipynb
  697. #####################################
  698. # Workspace templates release tests #
  699. #####################################
  700. - name: workspace_template_batch_inference
  701. group: Workspace templates
  702. working_dir: workspace_templates/01_batch_inference
  703. frequency: nightly-3x
  704. team: data
  705. cluster:
  706. byod:
  707. type: gpu
  708. cluster_compute: ../testing/compute_configs/gpu/aws.yaml
  709. run:
  710. timeout: 600
  711. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py
  712. variations:
  713. - __suffix__: aws
  714. - __suffix__: gce
  715. env: gce
  716. frequency: manual
  717. cluster:
  718. cluster_compute: ../testing/compute_configs/gpu/gce.yaml
  719. - name: workspace_template_many_model_training
  720. group: Workspace templates
  721. working_dir: workspace_templates/02_many_model_training
  722. frequency: nightly-3x
  723. team: ml
  724. cluster:
  725. byod:
  726. type: gpu
  727. cluster_compute: ../testing/compute_configs/cpu/aws.yaml
  728. run:
  729. timeout: 600
  730. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py
  731. variations:
  732. - __suffix__: aws
  733. - __suffix__: gce
  734. env: gce
  735. frequency: manual
  736. cluster:
  737. cluster_compute: ../testing/compute_configs/cpu/gce.yaml
  738. - name: workspace_template_serving_stable_diffusion
  739. group: Workspace templates
  740. working_dir: workspace_templates/03_serving_stable_diffusion
  741. frequency: nightly-3x
  742. team: serve
  743. cluster:
  744. byod:
  745. type: gpu
  746. post_build_script: byod_stable_diffusion.sh
  747. cluster_compute: ../testing/compute_configs/gpu/aws.yaml
  748. run:
  749. timeout: 600
  750. script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py && serve run app:entrypoint --non-blocking && python query.py
  751. variations:
  752. - __suffix__: aws
  753. - __suffix__: gce
  754. env: gce
  755. frequency: manual
  756. cluster:
  757. cluster_compute: ../testing/compute_configs/gpu/gce.yaml
  758. - name: workspace_template_finetuning_llms_with_deepspeed_llama_2_7b
  759. group: Workspace templates
  760. working_dir: workspace_templates/04_finetuning_llms_with_deepspeed
  761. frequency: nightly-3x
  762. team: ml
  763. cluster:
  764. byod:
  765. type: cu123
  766. # This needs to be in sync with requirements under go/llm-forge.
  767. post_build_script: byod_finetune_llvms.sh
  768. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml
  769. run:
  770. timeout: 1000
  771. script: chmod +x ./run_llama_ft.sh && ./run_llama_ft.sh --size=7b --as-test
  772. variations:
  773. - __suffix__: aws
  774. - __suffix__: gce
  775. env: gce
  776. frequency: manual
  777. cluster:
  778. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/gce_7b.yaml
  779. - name: workspace_template_finetuning_llms_with_deepspeed_llama_2_7b_lora
  780. group: Workspace templates
  781. working_dir: workspace_templates/04_finetuning_llms_with_deepspeed
  782. frequency: nightly-3x
  783. team: ml
  784. cluster:
  785. byod:
  786. type: cu123
  787. # This needs to be in sync with requirements under go/llm-forge.
  788. post_build_script: byod_finetune_llvms.sh
  789. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml
  790. run:
  791. timeout: 1000
  792. script: chmod +x ./run_llama_ft.sh && ./run_llama_ft.sh --size=7b --lora --as-test
  793. variations:
  794. - __suffix__: aws
  795. - __suffix__: gce
  796. env: gce
  797. frequency: manual
  798. cluster:
  799. cluster_compute: ../testing/compute_configs/04_finetuning_llms_with_deepspeed/gce_7b.yaml
  800. #######################
  801. # ML user tests
  802. #######################
  803. - name: ml_user_horovod_user_test_latest
  804. group: ML user tests
  805. working_dir: ml_user_tests
  806. frequency: nightly-3x
  807. team: ml
  808. cluster:
  809. byod:
  810. type: gpu
  811. post_build_script: byod_horovod_test.sh
  812. cluster_compute: horovod/compute_tpl_aws.yaml
  813. run:
  814. timeout: 1200
  815. script: python horovod/horovod_user_test.py
  816. wait_for_nodes:
  817. num_nodes: 4
  818. variations:
  819. - __suffix__: aws
  820. - __suffix__: gce
  821. env: gce
  822. frequency: manual
  823. cluster:
  824. cluster_compute: horovod/compute_tpl_gce.yaml
  825. alert: default
  826. - name: ml_user_horovod_user_test_master
  827. group: ML user tests
  828. working_dir: ml_user_tests
  829. frequency: nightly-3x
  830. team: ml
  831. cluster:
  832. byod:
  833. type: gpu
  834. post_build_script: byod_horovod_master_test.sh
  835. cluster_compute: horovod/compute_tpl_aws.yaml
  836. run:
  837. timeout: 1200
  838. script: python horovod/horovod_user_test.py
  839. wait_for_nodes:
  840. num_nodes: 4
  841. variations:
  842. - __suffix__: aws
  843. - __suffix__: gce
  844. env: gce
  845. frequency: manual
  846. cluster:
  847. cluster_compute: horovod/compute_tpl_gce.yaml
  848. alert: default
  849. - name: ml_user_train_tensorflow_mnist_test
  850. group: ML user tests
  851. working_dir: ml_user_tests
  852. frequency: nightly-3x
  853. team: ml
  854. cluster:
  855. byod:
  856. runtime_env:
  857. - TRAIN_PLACEMENT_GROUP_TIMEOUT_S=2000
  858. type: cu123
  859. cluster_compute: train/compute_tpl_aws.yaml
  860. run:
  861. timeout: 36000
  862. script: python train/train_tensorflow_mnist_test.py
  863. wait_for_nodes:
  864. num_nodes: 3
  865. variations:
  866. - __suffix__: aws
  867. - __suffix__: gce
  868. env: gce
  869. frequency: manual
  870. cluster:
  871. cluster_compute: train/compute_tpl_gce.yaml
  872. alert: default
  873. - name: ml_user_train_torch_linear_test
  874. group: ML user tests
  875. working_dir: ml_user_tests
  876. frequency: nightly-3x
  877. team: ml
  878. cluster:
  879. byod:
  880. runtime_env:
  881. - TRAIN_PLACEMENT_GROUP_TIMEOUT_S=2000
  882. type: gpu
  883. cluster_compute: train/compute_tpl_aws.yaml
  884. run:
  885. timeout: 36000
  886. script: python train/train_torch_linear_test.py
  887. wait_for_nodes:
  888. num_nodes: 3
  889. variations:
  890. - __suffix__: aws
  891. - __suffix__: gce
  892. env: gce
  893. frequency: manual
  894. cluster:
  895. cluster_compute: train/compute_tpl_gce.yaml
  896. alert: default
  897. - name: ml_user_tune_rllib_connect_test
  898. group: ML user tests
  899. working_dir: ml_user_tests
  900. frequency: nightly-3x
  901. team: ml
  902. cluster:
  903. byod:
  904. type: gpu
  905. post_build_script: byod_rllib_test.sh
  906. runtime_env:
  907. - RLLIB_TEST_NO_JAX_IMPORT=1
  908. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  909. cluster_compute: tune_rllib/compute_tpl_aws.yaml
  910. run:
  911. timeout: 2000
  912. script: python tune_rllib/run_connect_tests.py
  913. wait_for_nodes:
  914. num_nodes: 9
  915. variations:
  916. - __suffix__: aws
  917. - __suffix__: gce
  918. env: gce
  919. frequency: manual
  920. cluster:
  921. cluster_compute: tune_rllib/compute_tpl_gce.yaml
  922. alert: default
  923. #######################
  924. # Tune cloud tests
  925. #######################
  926. - name: tune_cloud_long_running_cloud_storage
  927. group: Tune cloud tests
  928. working_dir: tune_tests/cloud_tests
  929. frequency: weekly
  930. team: ml
  931. cluster:
  932. byod: {}
  933. cluster_compute: tpl_aws_1x4.yaml
  934. run:
  935. # 14 hours
  936. timeout: 50400
  937. long_running: true
  938. script: python workloads/long_running_cloud_storage.py s3://tune-cloud-tests/long_running_cloud_storage
  939. # NOTE: This smoke test is not useful to run because the point of the test
  940. # is to be long-running. It exists only for quickly debugging updates to the test.
  941. smoke_test:
  942. frequency: manual
  943. run:
  944. timeout: 600
  945. variations:
  946. - __suffix__: aws
  947. - __suffix__: gce
  948. env: gce
  949. frequency: manual
  950. cluster:
  951. cluster_compute: tpl_gce_1x4.yaml
  952. run:
  953. # 14 hours
  954. timeout: 50400
  955. long_running: true
  956. script: python workloads/long_running_cloud_storage.py gs://tune-cloud-tests/long_running_cloud_storage
  957. wait_for_nodes:
  958. num_nodes: 1
  959. alert: long_running_tests
  960. ########################
  961. # Tune scalability tests
  962. ########################
  963. - name: tune_scalability_bookkeeping_overhead
  964. group: Tune scalability tests
  965. working_dir: tune_tests/scalability_tests
  966. frequency: nightly
  967. team: ml
  968. cluster:
  969. byod: {}
  970. cluster_compute: tpl_1x16.yaml
  971. run:
  972. timeout: 1200
  973. script: python workloads/test_bookkeeping_overhead.py
  974. alert: tune_tests
  975. variations:
  976. - __suffix__: aws
  977. - __suffix__: gce
  978. env: gce
  979. frequency: manual
  980. cluster:
  981. cluster_compute: tpl_gce_1x16.yaml
  982. - name: tune_scalability_durable_trainable
  983. group: Tune scalability tests
  984. working_dir: tune_tests/scalability_tests
  985. frequency: nightly
  986. team: ml
  987. cluster:
  988. byod: {}
  989. cluster_compute: tpl_16x2.yaml
  990. run:
  991. timeout: 900
  992. script: python workloads/test_durable_trainable.py --bucket s3://tune-cloud-tests/scalability_durable_trainable
  993. wait_for_nodes:
  994. num_nodes: 16
  995. variations:
  996. - __suffix__: aws
  997. - __suffix__: gce
  998. env: gce
  999. frequency: manual
  1000. run:
  1001. timeout: 900
  1002. script: python workloads/test_durable_trainable.py --bucket gs://tune-cloud-tests/scalability_durable_trainable
  1003. wait_for_nodes:
  1004. num_nodes: 16
  1005. cluster:
  1006. cluster_compute: tpl_gce_16x2.yaml
  1007. alert: tune_tests
  1008. - name: tune_scalability_durable_multifile_checkpoints
  1009. group: Tune scalability tests
  1010. working_dir: tune_tests/scalability_tests
  1011. frequency: nightly
  1012. team: ml
  1013. cluster:
  1014. byod: {}
  1015. cluster_compute: tpl_16x2.yaml
  1016. run:
  1017. timeout: 900
  1018. script: python workloads/test_durable_multifile_checkpoints.py --bucket s3://tune-cloud-tests/scalability_durable_multifile_checkpoints
  1019. wait_for_nodes:
  1020. num_nodes: 16
  1021. variations:
  1022. - __suffix__: aws
  1023. - __suffix__: gce
  1024. env: gce
  1025. frequency: manual
  1026. run:
  1027. timeout: 900
  1028. script: python workloads/test_durable_multifile_checkpoints.py --bucket gs://tune-cloud-tests/scalability_durable_multifile_checkpoints
  1029. wait_for_nodes:
  1030. num_nodes: 16
  1031. cluster:
  1032. cluster_compute: tpl_gce_16x2.yaml
  1033. alert: tune_tests
  1034. - name: tune_scalability_long_running_large_checkpoints
  1035. group: Tune scalability tests
  1036. working_dir: tune_tests/scalability_tests
  1037. frequency: weekly
  1038. team: ml
  1039. cluster:
  1040. byod: {}
  1041. cluster_compute: tpl_1x32_hd.yaml
  1042. run:
  1043. timeout: 86400
  1044. script: python workloads/test_long_running_large_checkpoints.py
  1045. long_running: true
  1046. smoke_test:
  1047. frequency: nightly
  1048. run:
  1049. timeout: 3600
  1050. alert: tune_tests
  1051. variations:
  1052. - __suffix__: aws
  1053. - __suffix__: gce
  1054. env: gce
  1055. frequency: manual
  1056. smoke_test:
  1057. frequency: manual
  1058. cluster:
  1059. cluster_compute: tpl_gce_1x32_hd.yaml
  1060. - name: tune_scalability_network_overhead
  1061. group: Tune scalability tests
  1062. working_dir: tune_tests/scalability_tests
  1063. frequency: weekly
  1064. team: ml
  1065. cluster:
  1066. byod: {}
  1067. cluster_compute: tpl_100x2.yaml
  1068. run:
  1069. timeout: 750
  1070. prepare_timeout: 1200
  1071. script: python workloads/test_network_overhead.py
  1072. wait_for_nodes:
  1073. num_nodes: 100
  1074. alert: tune_tests
  1075. variations:
  1076. - __suffix__: aws
  1077. - __suffix__: smoke-test
  1078. frequency: nightly
  1079. cluster:
  1080. cluster_compute: tpl_20x2.yaml
  1081. run:
  1082. timeout: 750
  1083. prepare_timeout: 600
  1084. script: python workloads/test_network_overhead.py --smoke-test
  1085. wait_for_nodes:
  1086. num_nodes: 20
  1087. - __suffix__: gce
  1088. env: gce
  1089. frequency: manual
  1090. cluster:
  1091. cluster_compute: tpl_gce_100x2.yaml
  1092. - name: tune_scalability_result_throughput_cluster
  1093. group: Tune scalability tests
  1094. working_dir: tune_tests/scalability_tests
  1095. frequency: nightly-3x
  1096. team: ml
  1097. cluster:
  1098. byod: {}
  1099. cluster_compute: tpl_16x64.yaml
  1100. run:
  1101. timeout: 600
  1102. script: python workloads/test_result_throughput_cluster.py
  1103. wait_for_nodes:
  1104. num_nodes: 16
  1105. alert: tune_tests
  1106. variations:
  1107. - __suffix__: aws
  1108. - __suffix__: gce
  1109. env: gce
  1110. frequency: manual
  1111. cluster:
  1112. cluster_compute: tpl_gce_16x64.yaml
  1113. - name: tune_scalability_result_throughput_single_node
  1114. group: Tune scalability tests
  1115. working_dir: tune_tests/scalability_tests
  1116. frequency: nightly
  1117. team: ml
  1118. cluster:
  1119. byod: {}
  1120. cluster_compute: tpl_1x96.yaml
  1121. run:
  1122. timeout: 600
  1123. script: python workloads/test_result_throughput_single_node.py
  1124. alert: tune_tests
  1125. variations:
  1126. - __suffix__: aws
  1127. - __suffix__: gce
  1128. env: gce
  1129. frequency: manual
  1130. cluster:
  1131. cluster_compute: tpl_gce_1x96.yaml
  1132. ############################
  1133. # Tune fault tolerance tests
  1134. ############################
  1135. - name: tune_worker_fault_tolerance
  1136. group: Tune fault tolerance tests
  1137. working_dir: tune_tests/fault_tolerance_tests
  1138. stable: true
  1139. frequency: nightly-3x
  1140. team: ml
  1141. cluster:
  1142. byod: {}
  1143. cluster_compute: tpl_aws_16x1.yaml
  1144. run:
  1145. timeout: 5400
  1146. script: python workloads/test_tune_worker_fault_tolerance.py --bucket s3://tune-cloud-tests/worker_fault_tolerance
  1147. wait_for_nodes:
  1148. num_nodes: 16
  1149. # Disabled until we can kill nodes in GCE
  1150. # variations:
  1151. # - __suffix__: aws
  1152. # - __suffix__: gce
  1153. # env: gce
  1154. # frequency: manual
  1155. # run:
  1156. # timeout: 5400
  1157. # script: python workloads/test_tune_worker_fault_tolerance.py --bucket gs://tune-cloud-tests/worker_fault_tolerance
  1158. #
  1159. # wait_for_nodes:
  1160. # num_nodes: 16
  1161. # cluster:
  1162. # cluster_compute: tpl_gce_16x1.yaml
  1163. ########################
  1164. # Golden Notebook tests
  1165. ########################
  1166. - name: golden_notebook_torch_tune_serve_test
  1167. group: Golden Notebook tests
  1168. working_dir: golden_notebook_tests
  1169. frequency: nightly-3x
  1170. team: ml
  1171. cluster:
  1172. byod:
  1173. type: gpu
  1174. cluster_compute: gpu_tpl_aws.yaml
  1175. run:
  1176. timeout: 600
  1177. script: python workloads/torch_tune_serve_test.py
  1178. wait_for_nodes:
  1179. num_nodes: 2
  1180. variations:
  1181. - __suffix__: aws
  1182. - __suffix__: gce
  1183. env: gce
  1184. frequency: manual
  1185. cluster:
  1186. cluster_compute: gpu_tpl_gce.yaml
  1187. alert: default
  1188. #######################
  1189. # Long running tests
  1190. #######################
  1191. - name: long_running_actor_deaths
  1192. group: Long running tests
  1193. working_dir: long_running_tests
  1194. frequency: weekly
  1195. team: core
  1196. cluster:
  1197. byod:
  1198. pip:
  1199. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1200. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1201. runtime_env:
  1202. - RLLIB_TEST_NO_JAX_IMPORT=1
  1203. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1204. cluster_compute: tpl_cpu_1.yaml
  1205. run:
  1206. timeout: 86400
  1207. script: python workloads/actor_deaths.py
  1208. long_running: true
  1209. smoke_test:
  1210. frequency: nightly
  1211. run:
  1212. timeout: 3600
  1213. alert: long_running_tests
  1214. variations:
  1215. - __suffix__: aws
  1216. - __suffix__: gce
  1217. env: gce
  1218. frequency: manual
  1219. smoke_test:
  1220. frequency: manual
  1221. cluster:
  1222. cluster_compute: tpl_cpu_1_gce.yaml
  1223. - name: long_running_apex
  1224. group: Long running tests
  1225. working_dir: long_running_tests
  1226. stable: false
  1227. frequency: weekly
  1228. team: rllib
  1229. cluster:
  1230. byod:
  1231. type: gpu
  1232. post_build_script: byod_rllib_test.sh
  1233. runtime_env:
  1234. - RLLIB_TEST_NO_JAX_IMPORT=1
  1235. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1236. cluster_compute: tpl_cpu_3.yaml
  1237. run:
  1238. timeout: 86400
  1239. script: python workloads/apex.py
  1240. long_running: true
  1241. wait_for_nodes:
  1242. num_nodes: 3
  1243. smoke_test:
  1244. frequency: nightly
  1245. run:
  1246. timeout: 3600
  1247. alert: long_running_tests
  1248. variations:
  1249. - __suffix__: aws
  1250. - __suffix__: gce
  1251. env: gce
  1252. frequency: manual
  1253. smoke_test:
  1254. frequency: manual
  1255. run:
  1256. timeout: 3600
  1257. cluster:
  1258. cluster_compute: tpl_cpu_3_gce.yaml
  1259. - name: long_running_impala
  1260. group: Long running tests
  1261. working_dir: long_running_tests
  1262. frequency: weekly
  1263. team: rllib
  1264. cluster:
  1265. byod:
  1266. type: gpu
  1267. post_build_script: byod_rllib_test.sh
  1268. runtime_env:
  1269. - RLLIB_TEST_NO_JAX_IMPORT=1
  1270. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1271. cluster_compute: tpl_cpu_1_large.yaml
  1272. run:
  1273. timeout: 86400
  1274. script: python workloads/impala.py
  1275. long_running: true
  1276. smoke_test:
  1277. frequency: nightly
  1278. run:
  1279. timeout: 3600
  1280. alert: long_running_tests
  1281. variations:
  1282. - __suffix__: aws
  1283. - __suffix__: gce
  1284. env: gce
  1285. frequency: manual
  1286. smoke_test:
  1287. frequency: manual
  1288. run:
  1289. timeout: 3600
  1290. cluster:
  1291. cluster_compute: tpl_cpu_1_large_gce.yaml
  1292. - name: long_running_many_actor_tasks
  1293. group: Long running tests
  1294. working_dir: long_running_tests
  1295. frequency: weekly
  1296. team: core
  1297. cluster:
  1298. byod:
  1299. pip:
  1300. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1301. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1302. runtime_env:
  1303. - RLLIB_TEST_NO_JAX_IMPORT=1
  1304. cluster_compute: tpl_cpu_1.yaml
  1305. run:
  1306. timeout: 86400
  1307. script: python workloads/many_actor_tasks.py
  1308. long_running: true
  1309. smoke_test:
  1310. frequency: nightly
  1311. run:
  1312. timeout: 3600
  1313. alert: long_running_tests
  1314. variations:
  1315. - __suffix__: aws
  1316. - __suffix__: gce
  1317. env: gce
  1318. frequency: manual
  1319. smoke_test:
  1320. frequency: manual
  1321. run:
  1322. timeout: 3600
  1323. cluster:
  1324. cluster_compute: tpl_cpu_1_gce.yaml
  1325. - name: long_running_many_drivers
  1326. group: Long running tests
  1327. working_dir: long_running_tests
  1328. frequency: weekly
  1329. team: core
  1330. cluster:
  1331. byod:
  1332. pip:
  1333. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1334. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1335. runtime_env:
  1336. - RLLIB_TEST_NO_JAX_IMPORT=1
  1337. cluster_compute: tpl_cpu_4.yaml
  1338. run:
  1339. timeout: 86400
  1340. script: python workloads/many_drivers.py --iteration-num=4000
  1341. long_running: true
  1342. wait_for_nodes:
  1343. num_nodes: 4
  1344. smoke_test:
  1345. frequency: nightly
  1346. run:
  1347. timeout: 3600
  1348. alert: long_running_tests
  1349. variations:
  1350. - __suffix__: aws
  1351. - __suffix__: gce
  1352. env: gce
  1353. frequency: manual
  1354. smoke_test:
  1355. frequency: manual
  1356. run:
  1357. timeout: 3600
  1358. cluster:
  1359. cluster_compute: tpl_cpu_4_gce.yaml
  1360. - name: long_running_many_ppo
  1361. group: Long running tests
  1362. working_dir: long_running_tests
  1363. stable: false
  1364. frequency: weekly
  1365. team: ml
  1366. cluster:
  1367. byod:
  1368. type: gpu
  1369. post_build_script: byod_rllib_test.sh
  1370. runtime_env:
  1371. - RLLIB_TEST_NO_JAX_IMPORT=1
  1372. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  1373. cluster_compute: many_ppo.yaml
  1374. run:
  1375. timeout: 86400
  1376. script: python workloads/many_ppo.py
  1377. long_running: true
  1378. wait_for_nodes:
  1379. num_nodes: 1
  1380. smoke_test:
  1381. frequency: nightly
  1382. run:
  1383. timeout: 3600
  1384. alert: long_running_tests
  1385. variations:
  1386. - __suffix__: aws
  1387. - __suffix__: gce
  1388. env: gce
  1389. frequency: manual
  1390. smoke_test:
  1391. frequency: manual
  1392. run:
  1393. timeout: 3600
  1394. cluster:
  1395. cluster_compute: many_ppo_gce.yaml
  1396. - name: long_running_many_tasks
  1397. group: Long running tests
  1398. working_dir: long_running_tests
  1399. frequency: weekly
  1400. team: core
  1401. cluster:
  1402. byod:
  1403. pip:
  1404. # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48
  1405. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1406. runtime_env:
  1407. - RLLIB_TEST_NO_JAX_IMPORT=1
  1408. cluster_compute: tpl_cpu_1.yaml
  1409. run:
  1410. timeout: 86400
  1411. script: python workloads/many_tasks.py
  1412. long_running: true
  1413. smoke_test:
  1414. frequency: nightly
  1415. run:
  1416. timeout: 3600
  1417. alert: long_running_tests
  1418. variations:
  1419. - __suffix__: aws
  1420. - __suffix__: gce
  1421. env: gce
  1422. frequency: manual
  1423. smoke_test:
  1424. frequency: manual
  1425. run:
  1426. timeout: 3600
  1427. cluster:
  1428. cluster_compute: tpl_cpu_1_gce.yaml
  1429. - name: long_running_many_tasks_serialized_ids
  1430. group: Long running tests
  1431. working_dir: long_running_tests
  1432. frequency: weekly
  1433. team: core
  1434. cluster:
  1435. byod:
  1436. pip:
  1437. # TODO: Revisit this pre-built AutoROM wheel; tracked upstream in https://github.com/Farama-Foundation/AutoROM/issues/48
  1438. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1439. runtime_env:
  1440. - RLLIB_TEST_NO_JAX_IMPORT=1
  1441. cluster_compute: tpl_cpu_1.yaml
  1442. run:
  1443. timeout: 86400
  1444. script: python workloads/many_tasks_serialized_ids.py
  1445. long_running: true
  1446. smoke_test:
  1447. frequency: nightly
  1448. run:
  1449. timeout: 3600
  1450. alert: long_running_tests
  1451. variations:
  1452. - __suffix__: aws
  1453. - __suffix__: gce
  1454. env: gce
  1455. frequency: manual
  1456. smoke_test:
  1457. frequency: manual
  1458. run:
  1459. timeout: 3600
  1460. cluster:
  1461. cluster_compute: tpl_cpu_1_gce.yaml
  1462. - name: long_running_node_failures
  1463. group: Long running tests
  1464. working_dir: long_running_tests
  1465. frequency: weekly
  1466. team: core
  1467. cluster:
  1468. byod:
  1469. pip:
  1470. # TODO: Revisit this pre-built AutoROM wheel; tracked upstream in https://github.com/Farama-Foundation/AutoROM/issues/48
  1471. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1472. runtime_env:
  1473. - RLLIB_TEST_NO_JAX_IMPORT=1
  1474. cluster_compute: tpl_cpu_1.yaml
  1475. run:
  1476. timeout: 86400
  1477. script: python workloads/node_failures.py
  1478. long_running: true
  1479. smoke_test:
  1480. frequency: nightly
  1481. run:
  1482. timeout: 3600
  1483. alert: long_running_tests
  1484. variations:
  1485. - __suffix__: aws
  1486. - __suffix__: gce
  1487. env: gce
  1488. frequency: manual
  1489. smoke_test:
  1490. frequency: manual
  1491. run:
  1492. timeout: 3600
  1493. cluster:
  1494. cluster_compute: tpl_cpu_1_gce.yaml
  1495. - name: long_running_serve
  1496. group: Long running tests
  1497. working_dir: long_running_tests
  1498. frequency: weekly
  1499. team: serve
  1500. cluster:
  1501. byod:
  1502. pip:
  1503. # TODO: Revisit this pre-built AutoROM wheel; tracked upstream in https://github.com/Farama-Foundation/AutoROM/issues/48
  1504. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1505. runtime_env:
  1506. - RLLIB_TEST_NO_JAX_IMPORT=1
  1507. cluster_compute: tpl_cpu_1_large.yaml
  1508. run:
  1509. timeout: 86400
  1510. script: python workloads/serve.py
  1511. long_running: true
  1512. smoke_test:
  1513. frequency: nightly
  1514. run:
  1515. timeout: 3600
  1516. alert: long_running_tests
  1517. variations:
  1518. - __suffix__: aws
  1519. - __suffix__: gce
  1520. env: gce
  1521. frequency: manual
  1522. smoke_test:
  1523. frequency: manual
  1524. run:
  1525. timeout: 3600
  1526. cluster:
  1527. cluster_compute: tpl_cpu_1_gce.yaml
  1528. - name: long_running_serve_failure
  1529. group: Long running tests
  1530. working_dir: long_running_tests
  1531. stable: true
  1532. frequency: weekly
  1533. team: serve
  1534. cluster:
  1535. byod:
  1536. pip:
  1537. # TODO: Revisit this pre-built AutoROM wheel; tracked upstream in https://github.com/Farama-Foundation/AutoROM/issues/48
  1538. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1539. runtime_env:
  1540. - RLLIB_TEST_NO_JAX_IMPORT=1
  1541. cluster_compute: tpl_cpu_1_c5.yaml
  1542. run:
  1543. timeout: 86400
  1544. script: python workloads/serve_failure.py
  1545. long_running: true
  1546. smoke_test:
  1547. frequency: nightly
  1548. run:
  1549. timeout: 600
  1550. alert: long_running_tests
  1551. variations:
  1552. - __suffix__: aws
  1553. - __suffix__: gce
  1554. env: gce
  1555. frequency: manual
  1556. smoke_test:
  1557. frequency: manual
  1558. run:
  1559. timeout: 86400
  1560. cluster:
  1561. cluster_compute: tpl_cpu_1_c5_gce.yaml
  1562. - name: long_running_many_jobs
  1563. group: Long running tests
  1564. working_dir: long_running_tests
  1565. stable: true
  1566. frequency: weekly
  1567. team: serve
  1568. cluster:
  1569. byod:
  1570. pip:
  1571. # TODO: Revisit this pre-built AutoROM wheel; tracked upstream in https://github.com/Farama-Foundation/AutoROM/issues/48
  1572. - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
  1573. runtime_env:
  1574. - RLLIB_TEST_NO_JAX_IMPORT=1
  1575. cluster_compute: tpl_cpu_1.yaml
  1576. run:
  1577. timeout: 86400
  1578. script: python workloads/long_running_many_jobs.py --num-clients=1
  1579. long_running: true
  1580. smoke_test:
  1581. frequency: nightly
  1582. run:
  1583. timeout: 1800
  1584. alert: long_running_tests
  1585. variations:
  1586. - __suffix__: aws
  1587. - __suffix__: gce
  1588. env: gce
  1589. frequency: manual
  1590. smoke_test:
  1591. frequency: manual
  1592. run:
  1593. timeout: 3600
  1594. cluster:
  1595. cluster_compute: tpl_cpu_1_gce.yaml
  1596. - name: long_running_distributed_pytorch_pbt_failure
  1597. group: Long running tests
  1598. working_dir: long_running_distributed_tests
  1599. frequency: weekly
  1600. team: ml
  1601. cluster:
  1602. byod:
  1603. type: gpu
  1604. cluster_compute: compute_tpl.yaml
  1605. run:
  1606. timeout: 86400
  1607. script: python workloads/pytorch_pbt_failure.py
  1608. long_running: true
  1609. smoke_test:
  1610. frequency: manual
  1611. run:
  1612. timeout: 3600
  1613. alert: long_running_tests
  1614. variations:
  1615. - __suffix__: aws
  1616. - __suffix__: gce
  1617. env: gce
  1618. frequency: manual
  1619. smoke_test:
  1620. frequency: manual
  1621. run:
  1622. timeout: 3600
  1623. cluster:
  1624. cluster_compute: compute_tpl_gce.yaml
  1625. ########################
  1626. # Jobs tests
  1627. ########################
  1628. - name: jobs_basic_local_working_dir
  1629. group: Jobs tests
  1630. working_dir: jobs_tests
  1631. frequency: nightly
  1632. team: serve
  1633. cluster:
  1634. byod:
  1635. type: gpu
  1636. cluster_compute: compute_tpl_4_xlarge.yaml
  1637. run:
  1638. timeout: 600
  1639. script: python workloads/jobs_basic.py --working-dir "workloads"
  1640. wait_for_nodes:
  1641. num_nodes: 4
  1642. alert: default
  1643. variations:
  1644. - __suffix__: aws
  1645. - __suffix__: gce
  1646. env: gce
  1647. frequency: manual
  1648. cluster:
  1649. cluster_compute: compute_tpl_gce_4_xlarge.yaml
  1650. - name: jobs_basic_remote_working_dir
  1651. group: Jobs tests
  1652. working_dir: jobs_tests
  1653. frequency: nightly
  1654. team: serve
  1655. cluster:
  1656. byod:
  1657. type: gpu
  1658. cluster_compute: compute_tpl_4_xlarge.yaml
  1659. run:
  1660. timeout: 600
  1661. script: python workloads/jobs_basic.py --working-dir "https://github.com/anyscale/job-services-cuj-examples/archive/refs/heads/main.zip"
  1662. wait_for_nodes:
  1663. num_nodes: 4
  1664. alert: default
  1665. variations:
  1666. - __suffix__: aws
  1667. - __suffix__: gce
  1668. env: gce
  1669. frequency: manual
  1670. cluster:
  1671. cluster_compute: compute_tpl_gce_4_xlarge.yaml
  1672. - name: jobs_remote_multi_node
  1673. group: Jobs tests
  1674. team: serve
  1675. frequency: nightly
  1676. working_dir: jobs_tests
  1677. cluster:
  1678. byod:
  1679. type: gpu
  1680. cluster_compute: compute_tpl_4_xlarge.yaml
  1681. run:
  1682. timeout: 600
  1683. script: python workloads/jobs_remote_multi_node.py
  1684. wait_for_nodes:
  1685. num_nodes: 4
  1686. variations:
  1687. - __suffix__: aws
  1688. - __suffix__: gce
  1689. env: gce
  1690. frequency: manual
  1691. cluster:
  1692. cluster_compute: compute_tpl_gce_4_xlarge.yaml
  1693. - name: jobs_check_cuda_available
  1694. group: Jobs tests
  1695. team: serve
  1696. frequency: nightly
  1697. working_dir: jobs_tests
  1698. cluster:
  1699. byod:
  1700. type: gpu
  1701. cluster_compute: compute_tpl_gpu_node.yaml
  1702. run:
  1703. timeout: 600
  1704. script: python workloads/jobs_check_cuda_available.py
  1705. wait_for_nodes:
  1706. num_nodes: 2
  1707. variations:
  1708. - __suffix__: aws
  1709. - __suffix__: gce
  1710. env: gce
  1711. frequency: manual
  1712. cluster:
  1713. cluster_compute: compute_tpl_gce_gpu_node.yaml
  1714. - name: jobs_specify_num_gpus
  1715. group: Jobs tests
  1716. team: serve
  1717. frequency: nightly
  1718. working_dir: jobs_tests
  1719. cluster:
  1720. byod:
  1721. type: gpu
  1722. cluster_compute: compute_tpl_gpu_worker.yaml
  1723. run:
  1724. timeout: 600
  1725. script: python workloads/jobs_specify_num_gpus.py --working-dir "workloads"
  1726. wait_for_nodes:
  1727. num_nodes: 2
  1728. variations:
  1729. - __suffix__: aws
  1730. - __suffix__: gce
  1731. env: gce
  1732. frequency: manual
  1733. cluster:
  1734. cluster_compute: compute_tpl_gce_gpu_worker.yaml
  1735. ########################
  1736. # Runtime env tests
  1737. ########################
  1738. - name: runtime_env_rte_many_tasks_actors
  1739. group: Runtime env tests
  1740. working_dir: runtime_env_tests
  1741. frequency: nightly
  1742. team: core
  1743. cluster:
  1744. byod: {}
  1745. cluster_compute: rte_small.yaml
  1746. run:
  1747. timeout: 600
  1748. script: python workloads/rte_many_tasks_actors.py
  1749. wait_for_nodes:
  1750. num_nodes: 4
  1751. alert: default
  1752. variations:
  1753. - __suffix__: aws
  1754. - __suffix__: gce
  1755. env: gce
  1756. frequency: manual
  1757. cluster:
  1758. cluster_compute: rte_gce_small.yaml
  1759. - name: runtime_env_wheel_urls
  1760. group: Runtime env tests
  1761. working_dir: runtime_env_tests
  1762. frequency: nightly
  1763. team: core
  1764. cluster:
  1765. byod: {}
  1766. cluster_compute: rte_minimal.yaml
  1767. run:
  1768. timeout: 9000
  1769. script: python workloads/wheel_urls.py
  1770. wait_for_nodes:
  1771. num_nodes: 1
  1772. alert: default
  1773. variations:
  1774. - __suffix__: aws
  1775. - __suffix__: gce
  1776. env: gce
  1777. frequency: manual
  1778. cluster:
  1779. cluster_compute: rte_gce_minimal.yaml
  1780. # Disabled: the consensus seems to be that runtime_env_rte_ray_client should be tested in CI rather than as a nightly release test.
  1781. # - name: runtime_env_rte_ray_client
  1782. # group: Runtime env tests
  1783. # working_dir: runtime_env_tests
  1784. # frequency: nightly
  1785. # team: core
  1786. # cluster:
  1787. # cluster_compute: rte_minimal.yaml
  1788. # run:
  1789. # timeout: 600
  1790. # script: python workloads/rte_ray_client.py
  1791. # wait_for_nodes:
  1792. # num_nodes: 1
  1793. # alert: default
  1794. ########################
  1795. # Serve tests
  1796. ########################
  1797. - name: serve_scale_replicas
  1798. group: Serve tests
  1799. working_dir: serve_tests
  1800. frequency: nightly
  1801. team: serve
  1802. cluster:
  1803. byod: {}
  1804. cluster_compute: compute_tpl_single_node_32_cpu.yaml
  1805. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1806. run:
  1807. timeout: 7200
  1808. long_running: false
  1809. script: python workloads/replica_scalability.py
  1810. alert: default
  1811. variations:
  1812. - __suffix__: aws
  1813. - name: serve_multi_deployment_1k_noop_replica
  1814. group: Serve tests
  1815. working_dir: serve_tests
  1816. frequency: nightly
  1817. team: serve
  1818. cluster:
  1819. byod: {}
  1820. cluster_compute: compute_tpl_32_cpu.yaml
  1821. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1822. run:
  1823. timeout: 7200
  1824. long_running: false
  1825. script: python workloads/multi_deployment_1k_noop_replica.py
  1826. alert: default
  1827. variations:
  1828. - __suffix__: aws
  1829. - __suffix__: aws.py312
  1830. python: "3.12"
  1831. - __suffix__: gce
  1832. env: gce
  1833. frequency: manual
  1834. cluster:
  1835. cluster_compute: compute_tpl_32_cpu_gce.yaml
  1836. - name: serve_autoscaling_load_test
  1837. group: Serve tests
  1838. working_dir: serve_tests
  1839. frequency: nightly
  1840. team: serve
  1841. cluster:
  1842. byod:
  1843. type: gpu
  1844. cluster_compute: compute_tpl_single_node_32_cpu.yaml
  1845. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1846. run:
  1847. timeout: 7200
  1848. long_running: false
  1849. script: python workloads/autoscaling_load_test.py
  1850. alert: default
  1851. variations:
  1852. - __suffix__: aws
  1853. - name: serve_microbenchmarks
  1854. group: Serve tests
  1855. working_dir: serve_tests
  1856. frequency: nightly
  1857. team: serve
  1858. cluster:
  1859. byod: {}
  1860. cluster_compute: compute_tpl_single_node_32_cpu.yaml
  1861. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1862. run:
  1863. timeout: 7200
  1864. long_running: false
  1865. script: python workloads/microbenchmarks.py --run-all
  1866. alert: default
  1867. variations:
  1868. - __suffix__: aws
  1869. - __suffix__: gce
  1870. env: gce
  1871. frequency: manual
  1872. cluster:
  1873. cluster_compute: compute_tpl_single_node_gce.yaml
  1874. - name: serve_resnet_benchmark
  1875. group: Serve tests
  1876. working_dir: serve_tests
  1877. frequency: nightly
  1878. team: serve
  1879. cluster:
  1880. byod:
  1881. type: gpu
  1882. cluster_compute: compute_tpl_gpu_node.yaml
  1883. cloud_id: cld_wy5a6nhazplvu32526ams61d98
  1884. run:
  1885. timeout: 7200
  1886. long_running: false
  1887. script: python workloads/serve_resnet_benchmark.py --gpu-env
  1888. alert: default
  1889. variations:
  1890. - __suffix__: aws
  1891. - __suffix__: gce
  1892. env: gce
  1893. frequency: manual
  1894. cluster:
  1895. cluster_compute: compute_tpl_gpu_node_gce.yaml
  1896. ########################
  1897. # Train tests
  1898. ########################
  1899. - name: train_horovod_multi_node_test
  1900. group: Train tests
  1901. working_dir: train_tests/horovod
  1902. frequency: nightly
  1903. team: ml
  1904. cluster:
  1905. byod:
  1906. type: gpu
  1907. post_build_script: byod_horovod_test.sh
  1908. cluster_compute: compute_tpl_aws.yaml
  1909. run:
  1910. timeout: 3000
  1911. script: python train_horovod_multi_node_test.py
  1912. wait_for_nodes:
  1913. num_nodes: 2
  1914. variations:
  1915. - __suffix__: aws
  1916. - __suffix__: gce
  1917. env: gce
  1918. frequency: manual
  1919. cluster:
  1920. cluster_compute: compute_tpl_gce.yaml
  1921. alert: default
  1922. - name: train_multinode_persistence
  1923. group: Train tests
  1924. working_dir: train_tests/multinode_persistence
  1925. frequency: nightly
  1926. team: ml
  1927. cluster:
  1928. byod:
  1929. post_build_script: byod_train_persistence_test.sh
  1930. cluster_compute: compute_aws.yaml
  1931. run:
  1932. timeout: 3000
  1933. script: pytest -v test_persistence.py -s
  1934. wait_for_nodes:
  1935. num_nodes: 4
  1936. variations:
  1937. - __suffix__: aws
  1938. - __suffix__: gce
  1939. env: gce
  1940. frequency: manual
  1941. cluster:
  1942. cluster_compute: compute_gce.yaml
  1943. alert: default
  1944. - name: train_colocate_trainer
  1945. group: Train tests
  1946. working_dir: train_tests/colocate_trainer
  1947. frequency: nightly
  1948. team: ml
  1949. cluster:
  1950. byod: {}
  1951. cluster_compute: compute_aws.yaml
  1952. run:
  1953. timeout: 3000
  1954. script: pytest -v test_colocate_trainer.py -s
  1955. wait_for_nodes:
  1956. num_nodes: 4
  1957. alert: default
  1958. - name: xgboost_train_batch_inference_benchmark_10G
  1959. group: Train tests
  1960. working_dir: train_tests/xgboost_lightgbm
  1961. frequency: nightly
  1962. team: ml
  1963. cluster:
  1964. byod:
  1965. type: gpu
  1966. cluster_compute: compute_aws_1worker.yaml
  1967. run:
  1968. timeout: 36000
  1969. script: python train_batch_inference_benchmark.py "xgboost" --size=10G
  1970. wait_for_nodes:
  1971. num_nodes: 2
  1972. variations:
  1973. - __suffix__: aws
  1974. - __suffix__: gce
  1975. env: gce
  1976. frequency: manual
  1977. cluster:
  1978. cluster_compute: compute_gce_1worker.yaml
  1979. smoke_test:
  1980. frequency: manual
  1981. run:
  1982. timeout: 1800
  1983. alert: default
  1984. - name: xgboost_train_batch_inference_benchmark_100G
  1985. group: Train tests
  1986. working_dir: train_tests/xgboost_lightgbm
  1987. frequency: nightly-3x
  1988. team: ml
  1989. cluster:
  1990. byod:
  1991. type: gpu
  1992. cluster_compute: compute_aws_10workers.yaml
  1993. run:
  1994. timeout: 36000
  1995. script: python train_batch_inference_benchmark.py "xgboost" --size=100G
  1996. wait_for_nodes:
  1997. num_nodes: 11
  1998. variations:
  1999. - __suffix__: aws
  2000. - __suffix__: gce
  2001. env: gce
  2002. frequency: manual
  2003. cluster:
  2004. cluster_compute: compute_gce_10workers.yaml
  2005. smoke_test:
  2006. frequency: manual
  2007. run:
  2008. timeout: 1800
  2009. alert: default
  2010. - name: lightgbm_train_batch_inference_benchmark_10G
  2011. group: Train tests
  2012. working_dir: train_tests/xgboost_lightgbm
  2013. frequency: nightly
  2014. team: ml
  2015. cluster:
  2016. byod:
  2017. type: gpu
  2018. cluster_compute: compute_aws_1worker.yaml
  2019. run:
  2020. timeout: 36000
  2021. script: python train_batch_inference_benchmark.py "lightgbm" --size=10G
  2022. wait_for_nodes:
  2023. num_nodes: 2
  2024. variations:
  2025. - __suffix__: aws
  2026. - __suffix__: gce
  2027. env: gce
  2028. frequency: manual
  2029. cluster:
  2030. cluster_compute: compute_gce_1worker.yaml
  2031. smoke_test:
  2032. frequency: manual
  2033. run:
  2034. timeout: 1800
  2035. alert: default
  2036. - name: lightgbm_train_batch_inference_benchmark_100G
  2037. group: Train tests
  2038. working_dir: train_tests/xgboost_lightgbm
  2039. frequency: nightly-3x
  2040. team: ml
  2041. cluster:
  2042. byod:
  2043. type: gpu
  2044. cluster_compute: compute_aws_10workers.yaml
  2045. run:
  2046. timeout: 36000
  2047. script: python train_batch_inference_benchmark.py "lightgbm" --size=100G
  2048. wait_for_nodes:
  2049. num_nodes: 11
  2050. variations:
  2051. - __suffix__: aws
  2052. - __suffix__: gce
  2053. env: gce
  2054. frequency: manual
  2055. cluster:
  2056. cluster_compute: compute_gce_10workers.yaml
  2057. smoke_test:
  2058. frequency: manual
  2059. run:
  2060. timeout: 1800
  2061. alert: default
  2062. ########################
  2063. # RLlib tests
  2064. ########################
  2065. # ----------------------------------------------------------
  2066. # Checkpointing with RLModule and Learner APIs
  2067. # ----------------------------------------------------------
  2068. - name: rllib_learner_group_checkpointing_multinode
  2069. group: RLlib tests
  2070. working_dir: rllib_tests
  2071. frequency: nightly
  2072. team: rllib
  2073. stable: False
  2074. cluster:
  2075. byod:
  2076. type: gpu
  2077. post_build_script: byod_rllib_test.sh
  2078. runtime_env:
  2079. - RLLIB_TEST_NO_JAX_IMPORT=1
  2080. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2081. cluster_compute: multi_node_checkpointing_compute_config.yaml
  2082. run:
  2083. timeout: 3600
  2084. script: pytest checkpointing_tests/test_learner_group_checkpointing.py
  2085. wait_for_nodes:
  2086. num_nodes: 2
  2087. alert: default
  2088. variations:
  2089. - __suffix__: aws
  2090. - __suffix__: gce
  2091. env: gce
  2092. frequency: manual
  2093. cluster:
  2094. cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  2095. - name: rllib_learner_e2e_module_loading
  2096. group: RLlib tests
  2097. working_dir: rllib_tests
  2098. stable: false
  2099. frequency: nightly
  2100. team: rllib
  2101. cluster:
  2102. byod:
  2103. type: gpu
  2104. post_build_script: byod_rllib_test.sh
  2105. runtime_env:
  2106. - RLLIB_TEST_NO_JAX_IMPORT=1
  2107. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2108. cluster_compute: multi_node_checkpointing_compute_config.yaml
  2109. run:
  2110. timeout: 3600
  2111. script: pytest checkpointing_tests/test_e2e_rl_module_restore.py
  2112. wait_for_nodes:
  2113. num_nodes: 2
  2114. alert: default
  2115. variations:
  2116. - __suffix__: aws
  2117. - __suffix__: gce
  2118. env: gce
  2119. frequency: manual
  2120. cluster:
  2121. cluster_compute: multi_node_checkpointing_compute_config_gce.yaml
  2122. # ----------------------------------------------------------
  2123. # Learning and benchmarking tests
  2124. # ----------------------------------------------------------
  2125. # --------------------------
  2126. # DreamerV3
  2127. # --------------------------
  2128. # TODO (sven): Move the DreamerV3 algorithm and this test to PyTorch.
  2129. - name: rllib_learning_tests_pong_dreamerv3_tf2
  2130. group: RLlib tests
  2131. working_dir: rllib_tests
  2132. stable: false
  2133. frequency: weekly
  2134. team: rllib
  2135. cluster:
  2136. byod:
  2137. type: gpu
  2138. post_build_script: byod_rllib_dreamerv3_test.sh
  2139. runtime_env:
  2140. - RLLIB_TEST_NO_JAX_IMPORT=1
  2141. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2142. cluster_compute: 1gpu_4cpus.yaml
  2143. run:
  2144. timeout: 43200 # 12h
  2145. script: python learning_tests/tuned_examples/dreamerv3/atari_100k.py --framework=tf2 --env=ALE/Pong-v5 --num-gpus=1 --stop-reward=15.0 --as-release-test
  2146. alert: default
  2147. variations:
  2148. - __suffix__: aws
  2149. - __suffix__: gce
  2150. env: gce
  2151. frequency: manual
  2152. cluster:
  2153. cluster_compute: 1gpu_4cpus_gce.yaml
  2154. # --------------------------
  2155. # PPO
  2156. # --------------------------
  2157. - name: rllib_learning_tests_pong_ppo_torch
  2158. group: RLlib tests
  2159. working_dir: rllib_tests
  2160. stable: true
  2161. frequency: nightly
  2162. team: rllib
  2163. cluster:
  2164. byod:
  2165. type: gpu
  2166. post_build_script: byod_rllib_test.sh
  2167. runtime_env:
  2168. - RLLIB_TEST_NO_JAX_IMPORT=1
  2169. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2170. cluster_compute: 8gpus_96cpus.yaml
  2171. run:
  2172. timeout: 1200
  2173. script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env=ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --stop-reward=20.0 --as-release-test
  2174. alert: default
  2175. variations:
  2176. - __suffix__: aws
  2177. - __suffix__: gce
  2178. env: gce
  2179. frequency: manual
  2180. cluster:
  2181. cluster_compute: 8gpus_96cpus_gce.yaml
  2182. # --------------------------
  2183. # SAC
  2184. # --------------------------
  2185. - name: rllib_learning_tests_halfcheetah_sac_torch
  2186. group: RLlib tests
  2187. working_dir: rllib_tests
  2188. stable: true
  2189. frequency: nightly
  2190. team: rllib
  2191. cluster:
  2192. byod:
  2193. type: gpu
  2194. post_build_script: byod_rllib_test.sh
  2195. runtime_env:
  2196. - RLLIB_TEST_NO_JAX_IMPORT=1
  2197. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
  2198. cluster_compute: 4gpus_64cpus.yaml
  2199. run:
  2200. timeout: 7200
  2201. script: python learning_tests/tuned_examples/sac/halfcheetah_sac.py --enable-new-api-stack --num-gpus=4 --num-env-runners=8 --stop-reward=1000.0 --as-release-test
  2202. alert: default
  2203. variations:
  2204. - __suffix__: aws
  2205. - __suffix__: gce
  2206. env: gce
  2207. frequency: manual
  2208. cluster:
  2209. cluster_compute: 4gpus_64cpus_gce.yaml
  2210. ########################
  2211. # Core Nightly Tests
  2212. ########################
  2213. - name: shuffle_100gb
  2214. group: core-multi-test
  2215. working_dir: nightly_tests
  2216. frequency: nightly
  2217. team: core
  2218. cluster:
  2219. byod:
  2220. runtime_env:
  2221. - RAY_worker_killing_policy=retriable_lifo
  2222. cluster_compute: shuffle/shuffle_compute_multi.yaml
  2223. run:
  2224. timeout: 3000
  2225. script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
  2226. wait_for_nodes:
  2227. num_nodes: 4
  2228. variations:
  2229. - __suffix__: aws
  2230. - __suffix__: gce
  2231. env: gce
  2232. frequency: manual
  2233. cluster:
  2234. cluster_compute: shuffle/shuffle_compute_multi_gce.yaml
  2235. - name: stress_test_placement_group
  2236. group: core-multi-test
  2237. working_dir: nightly_tests
  2238. env: aws_perf
  2239. frequency: nightly
  2240. team: core
  2241. cluster:
  2242. byod: {}
  2243. cluster_compute: stress_tests/placement_group_tests_compute.yaml
  2244. run:
  2245. timeout: 7200
  2246. script: python stress_tests/test_placement_group.py
  2247. variations:
  2248. - __suffix__: aws
  2249. - __suffix__: gce
  2250. env: gce
  2251. frequency: manual
  2252. cluster:
  2253. cluster_compute: stress_tests/placement_group_tests_compute_gce.yaml
  2254. - name: decision_tree_autoscaling_20_runs
  2255. group: core-multi-test
  2256. working_dir: nightly_tests
  2257. frequency: nightly
  2258. team: core
  2259. cluster:
  2260. byod: {}
  2261. cluster_compute: decision_tree/autoscaling_compute.yaml
  2262. run:
  2263. timeout: 9600
  2264. script: python decision_tree/cart_with_tree.py --concurrency=20
  2265. variations:
  2266. - __suffix__: aws
  2267. - __suffix__: gce
  2268. env: gce
  2269. frequency: manual
  2270. cluster:
  2271. cluster_compute: decision_tree/autoscaling_compute_gce.yaml
  2272. - name: autoscaling_shuffle_1tb_1000_partitions
  2273. group: core-multi-test
  2274. working_dir: nightly_tests
  2275. frequency: nightly
  2276. team: core
  2277. cluster:
  2278. byod:
  2279. runtime_env:
  2280. - RAY_worker_killing_policy=retriable_lifo
  2281. cluster_compute: shuffle/shuffle_compute_autoscaling.yaml
  2282. run:
  2283. timeout: 4000
  2284. script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
  2285. --no-streaming
  2286. variations:
  2287. - __suffix__: aws
  2288. - __suffix__: gce
  2289. env: gce
  2290. frequency: manual
  2291. cluster:
  2292. cluster_compute: shuffle/shuffle_compute_autoscaling_gce.yaml
  2293. - name: microbenchmark
  2294. group: core-daily-test
  2295. team: core
  2296. frequency: nightly
  2297. env: aws_perf
  2298. working_dir: microbenchmark
  2299. cluster:
  2300. byod: {}
  2301. cluster_compute: tpl_64.yaml
  2302. run:
  2303. timeout: 1800
  2304. script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py
  2305. variations:
  2306. - __suffix__: aws
  2307. repeated_run: 5
  2308. - __suffix__: gce
  2309. env: gce
  2310. frequency: manual
  2311. cluster:
  2312. cluster_compute: tpl_64_gce.yaml
  2313. - __suffix__: aws.py312
  2314. frequency: weekly
  2315. python: "3.12"
  2316. - name: microbenchmark_unstable
  2317. group: core-daily-test
  2318. team: core
  2319. frequency: nightly
  2320. working_dir: microbenchmark
  2321. stable: false
  2322. cluster:
  2323. byod: {}
  2324. cluster_compute: tpl_64.yaml
  2325. run:
  2326. timeout: 1800
  2327. script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py --experimental
  2328. - name: microbenchmark_gpu_unstable
  2329. group: core-daily-test
  2330. team: core
  2331. frequency: nightly
  2332. working_dir: microbenchmark
  2333. stable: false
  2334. cluster:
  2335. byod:
  2336. type: gpu
  2337. cluster_compute: experimental/compute_gpu_2_aws.yaml
  2338. run:
  2339. timeout: 1800
  2340. script: python experimental/accelerated_dag_gpu_microbenchmark.py
  2341. - name: microbenchmark_gpu_multinode_unstable
  2342. group: core-daily-test
  2343. team: core
  2344. frequency: nightly
  2345. working_dir: microbenchmark
  2346. stable: false
  2347. cluster:
  2348. byod:
  2349. type: gpu
  2350. cluster_compute: experimental/compute_gpu_2x1_aws.yaml
  2351. run:
  2352. timeout: 1800
  2353. script: python experimental/accelerated_dag_gpu_microbenchmark.py --distributed
  2354. - name: benchmark_worker_startup
  2355. group: core-daily-test
  2356. team: core
  2357. frequency: nightly
  2358. working_dir: benchmark-worker-startup
  2359. stable: false
  2360. cluster:
  2361. byod:
  2362. type: gpu
  2363. cluster_compute: only_head_node_1gpu_64cpu.yaml
  2364. run:
  2365. timeout: 7200
  2366. script: python benchmark_worker_startup.py
  2367. --num_cpus_in_cluster 64
  2368. --num_gpus_in_cluster 64
  2369. --num_tasks_or_actors_per_run 64
  2370. --num_measurements_per_configuration 5
  2371. variations:
  2372. - __suffix__: aws
  2373. - __suffix__: gce
  2374. env: gce
  2375. frequency: manual
  2376. cluster:
  2377. cluster_compute: only_head_node_1gpu_64cpu_gce.yaml
  2378. - name: dask_on_ray_100gb_sort
  2379. group: core-daily-test
  2380. working_dir: nightly_tests
  2381. frequency: nightly
  2382. team: core
  2383. # Marked unstable; see https://github.com/ray-project/ray/issues/39165
  2384. stable: false
  2385. cluster:
  2386. byod:
  2387. runtime_env:
  2388. - RAY_worker_killing_policy=retriable_lifo
  2389. cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml
  2390. run:
  2391. timeout: 7200
  2392. script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions
  2393. 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
  2394. variations:
  2395. - __suffix__: aws
  2396. - __suffix__: gce
  2397. env: gce
  2398. frequency: manual
  2399. cluster:
  2400. cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template_gce.yaml
  2401. - name: dask_on_ray_large_scale_test_spilling
  2402. group: core-daily-test
  2403. working_dir: nightly_tests
  2404. frequency: nightly
  2405. team: data
  2406. cluster:
  2407. byod:
  2408. runtime_env:
  2409. - RAY_worker_killing_policy=retriable_lifo
  2410. cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
  2411. run:
  2412. timeout: 7200
  2413. script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
  2414. 70 --error_rate 0 --data_save_path /tmp/ray
  2415. wait_for_nodes:
  2416. num_nodes: 21
  2417. smoke_test:
  2418. frequency: nightly
  2419. cluster:
  2420. cluster_compute: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
  2421. run:
  2422. timeout: 7200
  2423. script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb
  2424. 70 --error_rate 0 --data_save_path /tmp/ray
  2425. wait_for_nodes:
  2426. num_nodes: 5
  2427. - name: stress_test_state_api_scale
  2428. group: core-daily-test
  2429. working_dir: nightly_tests
  2430. stable: false
  2431. frequency: nightly
  2432. team: core
  2433. cluster:
  2434. byod:
  2435. runtime_env:
  2436. - RAY_MAX_LIMIT_FROM_API_SERVER=1000000000
  2437. - RAY_MAX_LIMIT_FROM_DATA_SOURCE=1000000000
  2438. cluster_compute: stress_tests/stress_tests_compute_large.yaml
  2439. run:
  2440. timeout: 4200
  2441. script: python stress_tests/test_state_api_scale.py
  2442. wait_for_nodes:
  2443. num_nodes: 7
  2444. smoke_test:
  2445. frequency: nightly
  2446. cluster:
  2447. app_config: stress_tests/state_api_app_config.yaml
  2448. cluster_compute: stress_tests/smoke_test_compute.yaml
  2449. run:
  2450. timeout: 3600
  2451. wait_for_nodes:
  2452. num_nodes: 5
  2453. script: python stress_tests/test_state_api_scale.py --smoke-test
  2454. variations:
  2455. - __suffix__: aws
  2456. - __suffix__: aws.py312
  2457. frequency: manual
  2458. python: "3.12"
  2459. smoke_test:
  2460. frequency: nightly-3x
  2461. - __suffix__: gce
  2462. env: gce
  2463. frequency: manual
  2464. cluster:
  2465. cluster_compute: stress_tests/stress_tests_compute_large_gce.yaml
  2466. smoke_test:
  2467. frequency: manual
  2468. - name: shuffle_20gb_with_state_api
  2469. group: core-daily-test
  2470. working_dir: nightly_tests
  2471. frequency: nightly
  2472. team: core
  2473. cluster:
  2474. byod:
  2475. runtime_env:
  2476. - RAY_MAX_LIMIT_FROM_API_SERVER=1000000000
  2477. - RAY_MAX_LIMIT_FROM_DATA_SOURCE=1000000000
  2478. cluster_compute: shuffle/shuffle_compute_single.yaml
  2479. run:
  2480. timeout: 1000
  2481. script: python stress_tests/test_state_api_with_other_tests.py
  2482. nightly_tests/shuffle/shuffle_test.py --test-args="--num-partitions=100 --partition-size=200e6"
  2483. variations:
  2484. - __suffix__: aws
  2485. - __suffix__: gce
  2486. env: gce
  2487. frequency: manual
  2488. cluster:
  2489. cluster_compute: shuffle/shuffle_compute_single_gce.yaml
  2490. - name: stress_test_many_tasks
  2491. group: core-daily-test
  2492. working_dir: nightly_tests
  2493. env: aws_perf
  2494. frequency: nightly
  2495. team: core
  2496. cluster:
  2497. byod: {}
  2498. cluster_compute: stress_tests/stress_tests_compute.yaml
  2499. run:
  2500. timeout: 14400
  2501. wait_for_nodes:
  2502. num_nodes: 101
  2503. script: python stress_tests/test_many_tasks.py
  2504. smoke_test:
  2505. frequency: nightly
  2506. cluster:
  2507. app_config: stress_tests/stress_tests_app_config.yaml
  2508. cluster_compute: stress_tests/smoke_test_compute.yaml
  2509. run:
  2510. timeout: 3600
  2511. wait_for_nodes:
  2512. num_nodes: 5
  2513. script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
  2514. variations:
  2515. - __suffix__: aws
  2516. - __suffix__: gce
  2517. env: gce
  2518. frequency: manual
  2519. cluster:
  2520. cluster_compute: stress_tests/stress_tests_compute_gce.yaml
  2521. smoke_test:
  2522. frequency: manual
  2523. - name: stress_test_dead_actors
  2524. group: core-daily-test
  2525. working_dir: nightly_tests
  2526. env: aws_perf
  2527. frequency: nightly
  2528. team: core
  2529. cluster:
  2530. byod: {}
  2531. cluster_compute: stress_tests/stress_tests_compute.yaml
  2532. run:
  2533. timeout: 7200
  2534. wait_for_nodes:
  2535. num_nodes: 101
  2536. script: python stress_tests/test_dead_actors.py
  2537. smoke_test:
  2538. frequency: nightly
  2539. cluster:
  2540. app_config: stress_tests/stress_tests_app_config.yaml
  2541. cluster_compute: stress_tests/smoke_test_compute.yaml
  2542. run:
  2543. timeout: 3600
  2544. wait_for_nodes:
  2545. num_nodes: 5
  2546. script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3
  2547. --num-children=3
  2548. variations:
  2549. - __suffix__: aws
  2550. - __suffix__: gce
  2551. env: gce
  2552. frequency: manual
  2553. cluster:
  2554. cluster_compute: stress_tests/stress_tests_compute_gce.yaml
  2555. smoke_test:
  2556. frequency: manual
  2557. # The full test is not stable, so run the smoke test only.
  2558. # See https://github.com/ray-project/ray/issues/23244.
  2559. - name: threaded_actors_stress_test
  2560. group: core-daily-test
  2561. working_dir: nightly_tests
  2562. frequency: nightly
  2563. team: core
  2564. cluster:
  2565. byod: {}
  2566. cluster_compute: stress_tests/smoke_test_compute.yaml
  2567. run:
  2568. timeout: 3600
  2569. script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
  2570. 30
  2571. wait_for_nodes:
  2572. num_nodes: 5
  2573. variations:
  2574. - __suffix__: aws
  2575. - __suffix__: gce
  2576. env: gce
  2577. frequency: manual
  2578. cluster:
  2579. cluster_compute: stress_tests/smoke_test_compute_gce.yaml
  2580. # - name: threaded_actors_stress_test
  2581. # group: core-daily-test
  2582. # working_dir: nightly_tests
  2583. #
  2584. # frequency: nightly
  2585. # team: core
  2586. # cluster:
  2587. # cluster_compute: stress_tests/stress_test_threaded_actor_compute.yaml
  2588. #
  2589. # run:
  2590. # timeout: 7200
  2591. # script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s
  2592. # 60
  2593. #
  2594. # wait_for_nodes:
  2595. # num_nodes: 201
  2596. # timeout: 600
  2597. #
  2598. # smoke_test:
  2599. # frequency: nightly
  2600. # cluster:
  2601. # app_config: stress_tests/stress_tests_app_config.yaml
  2602. # cluster_compute: stress_tests/smoke_test_compute.yaml
  2603. #
  2604. # run:
  2605. # timeout: 3600
  2606. # script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
  2607. # 30
  2608. #
  2609. # wait_for_nodes:
  2610. # num_nodes: 5
  2611. # timeout: 600
  2612. - name: stress_test_many_runtime_envs
  2613. group: core-daily-test
  2614. working_dir: nightly_tests
  2615. frequency: nightly
  2616. team: core
  2617. cluster:
  2618. byod: {}
  2619. cluster_compute: stress_tests/smoke_test_compute.yaml
  2620. run:
  2621. timeout: 14400
  2622. wait_for_nodes:
  2623. num_nodes: 5
  2624. script: python stress_tests/test_many_runtime_envs.py --num_runtime_envs=100 --num_tasks=10000
  2625. variations:
  2626. - __suffix__: aws
  2627. - __suffix__: gce
  2628. env: gce
  2629. frequency: manual
  2630. cluster:
  2631. cluster_compute: stress_tests/smoke_test_compute_gce.yaml
  2632. smoke_test:
  2633. frequency: manual
  2634. - name: single_node_oom
  2635. group: core-daily-test
  2636. working_dir: nightly_tests
  2637. # TODO: https://github.com/ray-project/ray/issues/47596
  2638. stable: false
  2639. frequency: nightly
  2640. team: core
  2641. env: aws_perf
  2642. cluster:
  2643. byod: {}
  2644. cluster_compute: stress_tests/stress_tests_single_node_oom_compute.yaml
  2645. run:
  2646. timeout: 1000
  2647. script: python stress_tests/test_parallel_tasks_memory_pressure.py --num-tasks 20
  2648. variations:
  2649. - __suffix__: aws
  2650. - __suffix__: gce
  2651. env: gce
  2652. frequency: manual
  2653. cluster:
  2654. cluster_compute: stress_tests/stress_tests_single_node_oom_compute_gce.yaml
  2655. - name: tune_air_oom
  2656. group: core-daily-test
  2657. working_dir: air_tests
  2658. stable: false
  2659. frequency: nightly
  2660. team: core
  2661. cluster:
  2662. byod:
  2663. runtime_env:
  2664. - RAY_memory_usage_threshold=0.7
  2665. - RAY_task_oom_retries=-1
  2666. cluster_compute: oom/stress_tests_tune_air_oom_compute.yaml
  2667. run:
  2668. timeout: 3600
  2669. script: bash oom/tune_air_oom.sh
  2670. - name: dask_on_ray_1tb_sort
  2671. group: core-daily-test
  2672. working_dir: nightly_tests
  2673. frequency: nightly-3x
  2674. team: core
  2675. cluster:
  2676. byod:
  2677. runtime_env:
  2678. - RAY_worker_killing_policy=retriable_lifo
  2679. cluster_compute: dask_on_ray/1tb_sort_compute.yaml
  2680. run:
  2681. timeout: 7200
  2682. script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions
  2683. 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
  2684. wait_for_nodes:
  2685. num_nodes: 32
  2686. - name: many_nodes_actor_test_on_v2
  2687. group: core-daily-test
  2688. working_dir: benchmarks
  2689. frequency: nightly-3x
  2690. team: core
  2691. cluster:
  2692. byod: {}
  2693. cluster_compute: distributed/many_nodes_tests/compute_config.yaml
  2694. run:
  2695. timeout: 3600
  2696. # 2cpus per node x 1000 nodes / 0.2 cpus per actor = 10k
  2697. # 2cpus per node x 2000 nodes / 0.2 cpus per actor = 20k
  2698. script: python distributed/many_nodes_tests/actor_test.py --no-wait --cpus-per-actor=0.2 --total-actors 10000 20000
  2699. wait_for_nodes:
  2700. num_nodes: 500
  2701. variations:
  2702. - __suffix__: aws
  2703. - __suffix__: gce
  2704. env: gce
  2705. frequency: manual
  2706. cluster:
  2707. cluster_compute: distributed/many_nodes_tests/compute_config_gce.yaml
  2708. #- name: many_nodes_multi_master_test
  2709. # group: core-daily-test
  2710. # working_dir: nightly_tests
  2711. #
  2712. # frequency: nightly-3x
  2713. # team: core
  2714. # cluster:
  2715. # cluster_compute: many_nodes_tests/compute_config.yaml
  2716. #
  2717. # run:
  2718. # timeout: 7200
  2719. # script: python many_nodes_tests/multi_master_test.py
  2720. # wait_for_nodes:
  2721. # num_nodes: 251
  2722. #
  2723. - name: pg_autoscaling_regression_test
  2724. group: core-daily-test
  2725. working_dir: nightly_tests
  2726. frequency: nightly
  2727. team: core
  2728. cluster:
  2729. byod: {}
  2730. cluster_compute: placement_group_tests/compute.yaml
  2731. run:
  2732. timeout: 1200
  2733. script: python placement_group_tests/pg_run.py
  2734. variations:
  2735. - __suffix__: aws
  2736. - __suffix__: gce
  2737. env: gce
  2738. frequency: manual
  2739. cluster:
  2740. cluster_compute: placement_group_tests/compute_gce.yaml
  2741. - name: placement_group_performance_test
  2742. group: core-daily-test
  2743. working_dir: nightly_tests
  2744. frequency: nightly
  2745. team: core
  2746. cluster:
  2747. byod: {}
  2748. cluster_compute: placement_group_tests/pg_perf_test_compute.yaml
  2749. run:
  2750. timeout: 1200
  2751. script: python placement_group_tests/placement_group_performance_test.py
  2752. wait_for_nodes:
  2753. num_nodes: 5
  2754. variations:
  2755. - __suffix__: aws
  2756. - __suffix__: gce
  2757. env: gce
  2758. frequency: manual
  2759. cluster:
  2760. cluster_compute: placement_group_tests/pg_perf_test_compute_gce.yaml
  2761. #########################
  2762. # Core Scalability Tests
  2763. #########################
  2764. - name: single_node
  2765. group: core-scalability-test
  2766. working_dir: benchmarks
  2767. frequency: nightly
  2768. team: core
  2769. env: aws_perf
  2770. cluster:
  2771. byod:
  2772. type: gpu
  2773. runtime_env:
  2774. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2775. cluster_compute: single_node.yaml
  2776. run:
  2777. timeout: 12000
  2778. prepare: sleep 0
  2779. script: python single_node/test_single_node.py
  2780. variations:
  2781. - __suffix__: aws
  2782. - __suffix__: gce
  2783. env: gce
  2784. frequency: manual
  2785. cluster:
  2786. cluster_compute: single_node_gce.yaml
  2787. - name: object_store
  2788. group: core-scalability-test
  2789. working_dir: benchmarks
  2790. frequency: nightly
  2791. team: core
  2792. env: aws_perf
  2793. cluster:
  2794. byod:
  2795. type: gpu
  2796. runtime_env:
  2797. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2798. cluster_compute: object_store.yaml
  2799. run:
  2800. timeout: 3600
  2801. script: python object_store/test_object_store.py
  2802. wait_for_nodes:
  2803. num_nodes: 50
  2804. variations:
  2805. - __suffix__: aws
  2806. - __suffix__: gce
  2807. env: gce
  2808. frequency: manual
  2809. cluster:
  2810. cluster_compute: object_store_gce.yaml
  2811. - name: many_actors
  2812. group: core-scalability-test
  2813. working_dir: benchmarks
  2814. frequency: nightly-3x
  2815. team: core
  2816. env: aws_perf
  2817. cluster:
  2818. byod:
  2819. type: gpu
  2820. runtime_env:
  2821. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2822. cluster_compute: distributed.yaml
  2823. run:
  2824. timeout: 3600
  2825. script: python distributed/test_many_actors.py
  2826. wait_for_nodes:
  2827. num_nodes: 65
  2828. variations:
  2829. - __suffix__: aws
  2830. - __suffix__: gce
  2831. env: gce
  2832. frequency: manual
  2833. cluster:
  2834. cluster_compute: distributed_gce.yaml
  2835. - name: many_actors_smoke_test
  2836. group: core-scalability-test
  2837. working_dir: benchmarks
  2838. frequency: nightly
  2839. team: core
  2840. cluster:
  2841. byod:
  2842. type: gpu
  2843. runtime_env:
  2844. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2845. cluster_compute: distributed_smoke_test.yaml
  2846. run:
  2847. timeout: 3600
  2848. script: SMOKE_TEST=1 python distributed/test_many_actors.py
  2849. wait_for_nodes:
  2850. num_nodes: 2
  2851. - name: many_tasks
  2852. group: core-scalability-test
  2853. working_dir: benchmarks
  2854. frequency: nightly
  2855. team: core
  2856. env: aws_perf
  2857. cluster:
  2858. byod:
  2859. type: gpu
  2860. runtime_env:
  2861. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2862. cluster_compute: distributed.yaml
  2863. run:
  2864. timeout: 3600
  2865. script: python distributed/test_many_tasks.py --num-tasks=10000
  2866. wait_for_nodes:
  2867. num_nodes: 65
  2868. variations:
  2869. - __suffix__: aws
  2870. - __suffix__: gce
  2871. env: gce
  2872. frequency: manual
  2873. cluster:
  2874. cluster_compute: distributed_gce.yaml
  2875. - name: many_pgs
  2876. group: core-scalability-test
  2877. working_dir: benchmarks
  2878. frequency: nightly-3x
  2879. team: core
  2880. env: aws_perf
  2881. cluster:
  2882. byod:
  2883. type: gpu
  2884. runtime_env:
  2885. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2886. cluster_compute: distributed.yaml
  2887. run:
  2888. timeout: 3600
  2889. script: python distributed/test_many_pgs.py
  2890. wait_for_nodes:
  2891. num_nodes: 65
  2892. variations:
  2893. - __suffix__: aws
  2894. - __suffix__: gce
  2895. env: gce
  2896. frequency: manual
  2897. cluster:
  2898. cluster_compute: distributed_gce.yaml
  2899. - name: many_pgs_smoke_test
  2900. group: core-scalability-test
  2901. working_dir: benchmarks
  2902. frequency: nightly
  2903. team: core
  2904. cluster:
  2905. byod:
  2906. type: gpu
  2907. runtime_env:
  2908. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2909. cluster_compute: distributed_smoke_test.yaml
  2910. run:
  2911. timeout: 3600
  2912. script: SMOKE_TEST=1 python distributed/test_many_pgs.py
  2913. wait_for_nodes:
  2914. num_nodes: 2
  2915. - name: many_nodes
  2916. group: core-scalability-test
  2917. working_dir: benchmarks
  2918. frequency: nightly-3x
  2919. team: core
  2920. env: aws_perf
  2921. cluster:
  2922. byod:
  2923. type: gpu
  2924. runtime_env:
  2925. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2926. cluster_compute: many_nodes.yaml
  2927. run:
  2928. timeout: 3600
  2929. script: python distributed/test_many_tasks.py --num-tasks=1000
  2930. wait_for_nodes:
  2931. num_nodes: 250
  2932. variations:
  2933. - __suffix__: aws
  2934. - __suffix__: gce
  2935. env: gce
  2936. frequency: manual
  2937. cluster:
  2938. cluster_compute: many_nodes_gce.yaml
  2939. - name: scheduling_test_many_0s_tasks_many_nodes
  2940. group: core-scalability-test
  2941. working_dir: benchmarks
  2942. frequency: nightly
  2943. team: core
  2944. cluster:
  2945. byod:
  2946. type: gpu
  2947. runtime_env:
  2948. - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
  2949. cluster_compute: scheduling.yaml
  2950. run:
  2951. timeout: 3600
  2952. script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  2953. --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
  2954. wait_for_nodes:
  2955. num_nodes: 32
  2956. variations:
  2957. - __suffix__: aws
  2958. - __suffix__: gce
  2959. env: gce
  2960. frequency: manual
  2961. cluster:
  2962. cluster_compute: scheduling_gce.yaml
  2963. # - name: scheduling_test_many_5s_tasks_single_node
  2964. # group: core-scalability-test
  2965. # working_dir: benchmarks
  2966. # frequency: nightly
  2967. # team: core
  2968. # cluster:
  2969. # cluster_compute: scheduling.yaml
  2970. # run:
  2971. # timeout: 3600
  2972. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  2973. # --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  2974. # wait_for_nodes:
  2975. # num_nodes: 32
  2976. # timeout: 600
  2977. # stable: false
  2978. # - name: scheduling_test_many_5s_tasks_many_nodes
  2979. # group: core-scalability-test
  2980. # working_dir: benchmarks
  2981. # frequency: nightly
  2982. # team: core
  2983. # cluster:
  2984. # cluster_compute: scheduling.yaml
  2985. # run:
  2986. # timeout: 3600
  2987. # script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
  2988. # --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  2989. # wait_for_nodes:
  2990. # num_nodes: 32
  2991. # timeout: 600
  2992. # stable: false
  2993. ###############
  2994. # Dataset tests
  2995. ###############
  2996. - name: parquet_metadata_resolution
  2997. group: data-tests
  2998. working_dir: nightly_tests/dataset
  2999. frequency: nightly
  3000. team: data
  3001. cluster:
  3002. byod:
  3003. type: gpu
  3004. cluster_compute: single_node_benchmark_compute.yaml
  3005. run:
  3006. # Expect the test to finish around 40 seconds.
  3007. timeout: 100
  3008. script: python parquet_metadata_resolution.py --num-files 915 --cloud aws
  3009. variations:
  3010. - __suffix__: aws
  3011. - __suffix__: gce
  3012. env: gce
  3013. frequency: manual
  3014. cluster:
  3015. cluster_compute: single_node_benchmark_compute_gce.yaml
  3016. run:
  3017. script: python parquet_metadata_resolution.py --num-files 915 --cloud gcp
  3018. - name: dataset_random_access
  3019. group: data-tests
  3020. working_dir: nightly_tests/dataset
  3021. stable: false
  3022. frequency: manual
  3023. team: data
  3024. cluster:
  3025. byod:
  3026. type: gpu
  3027. pip:
  3028. - git+https://github.com/ray-project/ray_shuffling_data_loader.git@add-embedding-model
  3029. cluster_compute: pipelined_training_compute.yaml
  3030. run:
  3031. timeout: 1200
  3032. script: python dataset_random_access.py
  3033. wait_for_nodes:
  3034. num_nodes: 15
  3035. variations:
  3036. - __suffix__: aws
  3037. - __suffix__: gce
  3038. env: gce
  3039. frequency: manual
  3040. cluster:
  3041. cluster_compute: pipelined_training_compute_gce.yaml
  3042. - name: stable_diffusion_benchmark
  3043. group: data-tests
  3044. working_dir: nightly_tests/dataset
  3045. frequency: nightly
  3046. team: data
  3047. cluster:
  3048. byod:
  3049. type: gpu
  3050. post_build_script: byod_stable_diffusion.sh
  3051. cluster_compute: stable_diffusion_benchmark_compute.yaml
  3052. run:
  3053. timeout: 1800
  3054. script: python stable_diffusion_benchmark.py
  3055. variations:
  3056. - __suffix__: aws
  3057. - __suffix__: gce
  3058. env: gce
  3059. frequency: manual
  3060. cluster:
  3061. cluster_compute: stable_diffusion_benchmark_compute_gce.yaml
  3062. - name: streaming_data_ingest_benchmark_1tb
  3063. group: data-tests
  3064. working_dir: nightly_tests/dataset
  3065. frequency: nightly
  3066. team: data
  3067. cluster:
  3068. byod:
  3069. type: gpu
  3070. cluster_compute: data_ingest_benchmark_compute.yaml
  3071. run:
  3072. timeout: 300
  3073. script: python data_ingest_benchmark.py --dataset-size-gb=1000 --num-workers=20 --streaming
  3074. wait_for_nodes:
  3075. num_nodes: 20
  3076. variations:
  3077. - __suffix__: aws
  3078. - __suffix__: gce
  3079. env: gce
  3080. frequency: manual
  3081. cluster:
  3082. cluster_compute: data_ingest_benchmark_compute_gce.yaml
  3083. - name: streaming_data_ingest_benchmark_100gb_gpu
  3084. group: data-tests
  3085. working_dir: nightly_tests/dataset
  3086. frequency: nightly
  3087. team: data
  3088. cluster:
  3089. byod:
  3090. type: gpu
  3091. cluster_compute: data_ingest_benchmark_compute_gpu.yaml
  3092. run:
  3093. timeout: 300
  3094. script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu
  3095. wait_for_nodes:
  3096. num_nodes: 3
  3097. variations:
  3098. - __suffix__: aws
  3099. - __suffix__: gce
  3100. env: gce
  3101. frequency: manual
  3102. cluster:
  3103. cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
  3104. # This test case stops the data ingestion iteration early on the GPU actors.
  3105. # This is a common usage in PyTorch Lightning
  3106. # (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches).
  3107. # There was a bug in Ray Data that caused a GPU memory leak (see #3.919).
  3108. # We add this test case to cover this scenario.
  3109. - name: streaming_data_ingest_benchmark_100gb_gpu_early_stop
  3110. group: data-tests
  3111. working_dir: nightly_tests/dataset
  3112. frequency: nightly
  3113. team: data
  3114. cluster:
  3115. byod:
  3116. type: gpu
  3117. cluster_compute: data_ingest_benchmark_compute_gpu.yaml
  3118. run:
  3119. timeout: 300
  3120. script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu --early-stop
  3121. wait_for_nodes:
  3122. num_nodes: 3
  3123. variations:
  3124. - __suffix__: aws
  3125. - __suffix__: gce
  3126. env: gce
  3127. frequency: manual
  3128. cluster:
  3129. cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
  3130. - name: aggregate_benchmark
  3131. group: data-tests
  3132. working_dir: nightly_tests/dataset
  3133. frequency: nightly
  3134. team: data
  3135. cluster:
  3136. byod:
  3137. type: gpu
  3138. cluster_compute: single_node_benchmark_compute.yaml
  3139. run:
  3140. timeout: 1800
  3141. script: python aggregate_benchmark.py
  3142. variations:
  3143. - __suffix__: aws
  3144. - __suffix__: gce
  3145. env: gce
  3146. frequency: manual
  3147. cluster:
  3148. cluster_compute: single_node_benchmark_compute_gce.yaml
  3149. - name: read_parquet_benchmark_single_node
  3150. group: data-tests
  3151. working_dir: nightly_tests/dataset
  3152. frequency: nightly
  3153. team: data
  3154. cluster:
  3155. byod:
  3156. type: gpu
  3157. post_build_script: byod_install_mosaicml.sh
  3158. cluster_compute: single_node_benchmark_compute.yaml
  3159. run:
  3160. # Expect the benchmark to finish in 400 seconds.
  3161. timeout: 400
  3162. script: python read_parquet_benchmark.py
  3163. variations:
  3164. - __suffix__: aws
  3165. - __suffix__: gce
  3166. env: gce
  3167. frequency: manual
  3168. cluster:
  3169. cluster_compute: single_node_benchmark_compute_gce.yaml
  3170. - name: read_images_benchmark_single_node
  3171. group: data-tests
  3172. working_dir: nightly_tests/dataset
  3173. frequency: nightly
  3174. team: data
  3175. cluster:
  3176. byod:
  3177. type: gpu
  3178. post_build_script: byod_install_mosaicml.sh
  3179. cluster_compute: single_node_benchmark_compute.yaml
  3180. run:
  3181. timeout: 1800
  3182. script: python read_images_benchmark.py --single-node
  3183. variations:
  3184. - __suffix__: aws
  3185. - __suffix__: gce
  3186. env: gce
  3187. frequency: manual
  3188. cluster:
  3189. cluster_compute: single_node_benchmark_compute_gce.yaml
  3190. # TODO: Re-enable this test once we fix https://github.com/ray-project/ray/issues/40686.
  3191. # - name: read_images_benchmark_multi_node
  3192. # group: data-tests
  3193. # working_dir: nightly_tests/dataset
  3194. # frequency: nightly-3x
  3195. # team: data
  3196. # cluster:
  3197. # byod:
  3198. # type: gpu
  3199. # cluster_compute: multi_node_read_images_benchmark_compute.yaml
  3200. # run:
  3201. # timeout: 28800
  3202. # script: python read_images_benchmark.py --multi-node
  3203. # variations:
  3204. # - __suffix__: aws
  3205. # - __suffix__: gce
  3206. # env: gce
  3207. # frequency: manual
  3208. # cluster:
  3209. # cluster_compute: multi_node_read_images_benchmark_compute_gce.yaml
  3210. - name: read_images_comparison_microbenchmark_single_node
  3211. group: data-tests
  3212. working_dir: nightly_tests/dataset
  3213. frequency: nightly
  3214. team: data
  3215. cluster:
  3216. byod:
  3217. type: gpu
  3218. post_build_script: byod_install_mosaicml.sh
  3219. cluster_compute: single_worker_node_0_head_node_benchmark_compute.yaml
  3220. run:
  3221. timeout: 1800
  3222. script: bash run_image_loader_microbenchmark.sh
  3223. variations:
  3224. - __suffix__: aws
  3225. - __suffix__: gce
  3226. env: gce
  3227. frequency: manual
  3228. cluster:
  3229. cluster_compute: single_node_benchmark_compute_gce.yaml
  3230. - name: read_images_train_4_gpu
  3231. group: data-tests
  3232. working_dir: nightly_tests/dataset
  3233. frequency: nightly
  3234. team: data
  3235. cluster:
  3236. byod:
  3237. type: gpu
  3238. post_build_script: byod_install_mosaicml.sh
  3239. cluster_compute: multi_node_train_4_workers.yaml
  3240. run:
  3241. timeout: 18000
  3242. script: python multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 2
  3243. variations:
  3244. - __suffix__: aws
  3245. - __suffix__: gce
  3246. env: gce
  3247. frequency: manual
  3248. cluster:
  3249. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3250. - name: read_images_train_4_gpu_worker_chaos
  3251. group: data-tests
  3252. working_dir: nightly_tests
  3253. frequency: nightly
  3254. team: data
  3255. cluster:
  3256. byod:
  3257. type: gpu
  3258. post_build_script: byod_install_mosaicml.sh
  3259. cluster_compute: dataset/multi_node_train_4_workers.yaml
  3260. run:
  3261. timeout: 18000
  3262. prepare: python setup_chaos.py --kill-workers --kill-interval 100 --max-to-kill 3 --task-names "ReadImage->Map(wnid_to_index)->Map(crop_and_flip_image)"
  3263. script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
  3264. variations:
  3265. - __suffix__: aws
  3266. - __suffix__: gce
  3267. env: gce
  3268. frequency: manual
  3269. cluster:
  3270. cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3271. - name: read_images_train_4_gpu_node_chaos
  3272. group: data-tests
  3273. working_dir: nightly_tests
  3274. frequency: nightly
  3275. team: data
  3276. cluster:
  3277. byod:
  3278. type: gpu
  3279. post_build_script: byod_install_mosaicml.sh
  3280. cluster_compute: dataset/multi_node_train_4_workers.yaml
  3281. run:
  3282. timeout: 18000
  3283. prepare: python setup_chaos.py --kill-interval 200 --max-to-kill 1 --task-names "_RayTrainWorker__execute.get_next"
  3284. script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
  3285. variations:
  3286. - __suffix__: aws
  3287. - __suffix__: gce
  3288. env: gce
  3289. frequency: manual
  3290. cluster:
  3291. cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3292. - name: read_images_train_16_gpu
  3293. group: data-tests
  3294. working_dir: nightly_tests/dataset
  3295. frequency: nightly
  3296. team: data
  3297. cluster:
  3298. byod:
  3299. type: gpu
  3300. post_build_script: byod_install_mosaicml.sh
  3301. cluster_compute: multi_node_train_16_workers.yaml
  3302. run:
  3303. timeout: 18000
  3304. script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --use-gpu --num-epochs 2
  3305. variations:
  3306. - __suffix__: aws
  3307. - __suffix__: gce
  3308. env: gce
  3309. frequency: manual
  3310. cluster:
  3311. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
  3312. - name: read_images_train_16_gpu_preserve_order
  3313. group: data-tests
  3314. working_dir: nightly_tests/dataset
  3315. frequency: nightly
  3316. team: data
  3317. cluster:
  3318. byod:
  3319. type: gpu
  3320. post_build_script: byod_install_mosaicml.sh
  3321. cluster_compute: multi_node_train_16_workers.yaml
  3322. run:
  3323. timeout: 18000
  3324. script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --preserve-order --use-gpu --num-epochs 2
  3325. variations:
  3326. - __suffix__: aws
  3327. - __suffix__: gce
  3328. env: gce
  3329. frequency: manual
  3330. cluster:
  3331. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
  3332. - name: read_parquet_train_4_gpu
  3333. group: data-tests
  3334. working_dir: nightly_tests/dataset
  3335. frequency: nightly
  3336. team: data
  3337. cluster:
  3338. byod:
  3339. type: gpu
  3340. post_build_script: byod_install_mosaicml.sh
  3341. cluster_compute: multi_node_train_4_workers.yaml
  3342. run:
  3343. timeout: 3600
  3344. script: python multi_node_train_benchmark.py --num-workers 4 --file-type parquet --target-worker-gb 50 --use-gpu
  3345. variations:
  3346. - __suffix__: aws
  3347. - __suffix__: gce
  3348. env: gce
  3349. frequency: manual
  3350. cluster:
  3351. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
  3352. - name: read_parquet_train_16_gpu
  3353. group: data-tests
  3354. working_dir: nightly_tests/dataset
  3355. frequency: nightly
  3356. team: data
  3357. cluster:
  3358. byod:
  3359. type: gpu
  3360. post_build_script: byod_install_mosaicml.sh
  3361. cluster_compute: multi_node_train_16_workers.yaml
  3362. run:
  3363. timeout: 3600
  3364. script: python multi_node_train_benchmark.py --num-workers 16 --file-type parquet --target-worker-gb 50 --use-gpu
  3365. variations:
  3366. - __suffix__: aws
  3367. - __suffix__: gce
  3368. env: gce
  3369. frequency: manual
  3370. cluster:
  3371. cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
  3372. - name: read_images_train_1_gpu_5_cpu
  3373. group: data-tests
  3374. working_dir: nightly_tests/dataset
  3375. frequency: nightly
  3376. team: data
  3377. cluster:
  3378. byod:
  3379. type: gpu
  3380. post_build_script: byod_install_mosaicml.sh
  3381. cluster_compute: multi_node_train_1g5c.yaml
  3382. run:
  3383. timeout: 2400
  3384. script: python multi_node_train_benchmark.py --num-workers 1 --file-type image --use-gpu --num-epochs 2 --skip-train-model --prefetch-batches 16 --batch-size -1 --disable-locality-with-output
  3385. variations:
  3386. - __suffix__: aws
  3387. - __suffix__: gce
  3388. env: gce
  3389. frequency: manual
  3390. cluster:
  3391. cluster_compute: compute_gpu_1g5c_gce.yaml
  3392. - name: read_tfrecords_benchmark_single_node
  3393. group: data-tests
  3394. working_dir: nightly_tests/dataset
  3395. frequency: nightly
  3396. team: data
  3397. cluster:
  3398. byod:
  3399. type: gpu
  3400. post_build_script: byod_install_mosaicml.sh
  3401. cluster_compute: single_node_benchmark_compute.yaml
  3402. run:
  3403. # Expect the benchmark to finish around 22 minutes.
  3404. timeout: 1800
  3405. script: python read_tfrecords_benchmark.py
  3406. variations:
  3407. - __suffix__: aws
  3408. - __suffix__: gce
  3409. env: gce
  3410. frequency: manual
  3411. cluster:
  3412. cluster_compute: single_node_benchmark_compute_gce.yaml
  3413. - name: map_batches_benchmark_single_node
  3414. group: data-tests
  3415. working_dir: nightly_tests/dataset
  3416. frequency: nightly
  3417. team: data
  3418. cluster:
  3419. byod:
  3420. type: gpu
  3421. cluster_compute: single_node_benchmark_compute.yaml
  3422. run:
  3423. # Expect the benchmark to finish around 30 minutes.
  3424. timeout: 2400
  3425. script: python map_batches_benchmark.py
  3426. variations:
  3427. - __suffix__: aws
  3428. - __suffix__: gce
  3429. env: gce
  3430. frequency: manual
  3431. cluster:
  3432. cluster_compute: single_node_benchmark_compute_gce.yaml
  3433. - name: iter_tensor_batches_benchmark_single_node
  3434. group: data-tests
  3435. working_dir: nightly_tests/dataset
  3436. frequency: nightly
  3437. team: data
  3438. cluster:
  3439. byod:
  3440. type: gpu
  3441. cluster_compute: single_node_benchmark_compute.yaml
  3442. run:
  3443. # Expect the benchmark to finish around 30 minutes.
  3444. timeout: 2400
  3445. script: python iter_tensor_batches_benchmark.py
  3446. variations:
  3447. - __suffix__: aws
  3448. - __suffix__: gce
  3449. env: gce
  3450. frequency: manual
  3451. cluster:
  3452. cluster_compute: single_node_benchmark_compute_gce.yaml
  3453. - name: iter_tensor_batches_benchmark_multi_node
  3454. group: data-tests
  3455. working_dir: nightly_tests/dataset
  3456. frequency: nightly
  3457. team: data
  3458. cluster:
  3459. byod:
  3460. type: gpu
  3461. cluster_compute: multi_node_benchmark_compute.yaml
  3462. run:
  3463. # Expect the benchmark to finish within 90 minutes.
  3464. timeout: 5400
  3465. script: python iter_tensor_batches_benchmark.py --data-size-gb=10
  3466. variations:
  3467. - __suffix__: aws
  3468. - __suffix__: gce
  3469. env: gce
  3470. frequency: manual
  3471. cluster:
  3472. cluster_compute: multi_node_benchmark_compute_gce.yaml
  3473. - name: iter_batches_benchmark_single_node
  3474. group: data-tests
  3475. working_dir: nightly_tests/dataset
  3476. frequency: nightly
  3477. team: data
  3478. cluster:
  3479. byod:
  3480. type: gpu
  3481. cluster_compute: single_node_benchmark_compute.yaml
  3482. run:
  3483. # Expect the benchmark to finish in around 12 minutes.
  3484. timeout: 1080
  3485. script: python iter_batches_benchmark.py
  3486. variations:
  3487. - __suffix__: aws
  3488. - __suffix__: gce
  3489. env: gce
  3490. frequency: manual
  3491. cluster:
  3492. cluster_compute: single_node_benchmark_compute_gce.yaml
  3493. - name: dataset_shuffle_random_shuffle_1tb
  3494. group: data-tests
  3495. working_dir: nightly_tests
  3496. frequency: nightly
  3497. team: data
  3498. cluster:
  3499. byod:
  3500. runtime_env:
  3501. - RAY_worker_killing_policy=retriable_lifo
  3502. pip:
  3503. - ray[default]
  3504. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3505. run:
  3506. timeout: 7200
  3507. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3508. wait_for_nodes:
  3509. num_nodes: 20
  3510. variations:
  3511. - __suffix__: aws
  3512. - __suffix__: gce
  3513. env: gce
  3514. frequency: manual
  3515. cluster:
  3516. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3517. - name: dataset_shuffle_sort_1tb
  3518. group: data-tests
  3519. working_dir: nightly_tests
  3520. frequency: nightly
  3521. team: data
  3522. stable: False
  3523. cluster:
  3524. byod:
  3525. runtime_env:
  3526. - RAY_worker_killing_policy=retriable_lifo
  3527. pip:
  3528. - ray[default]
  3529. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3530. run:
  3531. timeout: 7200
  3532. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3533. wait_for_nodes:
  3534. num_nodes: 20
  3535. variations:
  3536. - __suffix__: aws
  3537. - __suffix__: gce
  3538. env: gce
  3539. frequency: manual
  3540. cluster:
  3541. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3542. - name: dataset_shuffle_push_based_random_shuffle_1tb
  3543. group: data-tests
  3544. working_dir: nightly_tests
  3545. stable: false
  3546. frequency: nightly
  3547. team: data
  3548. cluster:
  3549. byod:
  3550. runtime_env:
  3551. - RAY_worker_killing_policy=retriable_lifo
  3552. pip:
  3553. - ray[default]
  3554. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3555. run:
  3556. timeout: 7200
  3557. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3558. wait_for_nodes:
  3559. num_nodes: 20
  3560. variations:
  3561. - __suffix__: aws
  3562. - __suffix__: gce
  3563. env: gce
  3564. frequency: manual
  3565. cluster:
  3566. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3567. - name: dataset_shuffle_push_based_sort_1tb
  3568. group: data-tests
  3569. working_dir: nightly_tests
  3570. frequency: nightly
  3571. team: data
  3572. stable: False
  3573. cluster:
  3574. byod:
  3575. runtime_env:
  3576. - RAY_worker_killing_policy=retriable_lifo
  3577. pip:
  3578. - ray[default]
  3579. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3580. run:
  3581. timeout: 7200
  3582. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3583. wait_for_nodes:
  3584. num_nodes: 20
  3585. variations:
  3586. - __suffix__: aws
  3587. - __suffix__: gce
  3588. env: gce
  3589. frequency: manual
  3590. cluster:
  3591. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3592. - name: dataset_shuffle_push_based_random_shuffle_100tb
  3593. group: data-tests
  3594. working_dir: nightly_tests
  3595. stable: false
  3596. frequency: weekly
  3597. team: data
  3598. cluster:
  3599. byod:
  3600. runtime_env:
  3601. - RAY_object_spilling_config={"type":"filesystem","params":{"directory_path":["/tmp/data0","/tmp/data1"]}}
  3602. post_build_script: byod_dataset_shuffle.sh
  3603. cluster_compute: shuffle/100tb_shuffle_compute.yaml
  3604. run:
  3605. timeout: 28800
  3606. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=100000 --partition-size=1e9 --shuffle
  3607. wait_for_nodes:
  3608. num_nodes: 100
  3609. variations:
  3610. - __suffix__: aws
  3611. - __suffix__: gce
  3612. env: gce
  3613. frequency: manual
  3614. cluster:
  3615. cluster_compute: shuffle/100tb_shuffle_compute_gce.yaml
  3616. run:
  3617. timeout: 28800
  3618. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=40000 --partition-size=1e9 --shuffle
  3619. wait_for_nodes:
  3620. num_nodes: 100
  3621. ##################
  3622. # Core Chaos tests
  3623. ##################
  3624. - name: chaos_many_tasks_kill_raylet
  3625. group: core-nightly-test
  3626. working_dir: nightly_tests
  3627. frequency: nightly
  3628. team: core
  3629. cluster:
  3630. byod: {}
  3631. cluster_compute: chaos_test/compute_template.yaml
  3632. run:
  3633. timeout: 3600
  3634. wait_for_nodes:
  3635. num_nodes: 10
  3636. prepare: python setup_chaos.py --no-start
  3637. script: python chaos_test/test_chaos_basic.py --workload=tasks
  3638. variations:
  3639. - __suffix__: aws
  3640. - __suffix__: gce
  3641. env: gce
  3642. frequency: manual
  3643. cluster:
  3644. cluster_compute: chaos_test/compute_template_gce.yaml
  3645. - name: chaos_many_tasks_terminate_instance
  3646. group: core-nightly-test
  3647. working_dir: nightly_tests
  3648. frequency: nightly
  3649. team: core
  3650. cluster:
  3651. byod: {}
  3652. cluster_compute: chaos_test/compute_template.yaml
  3653. run:
  3654. timeout: 3600
  3655. wait_for_nodes:
  3656. num_nodes: 10
  3657. prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance
  3658. script: python chaos_test/test_chaos_basic.py --workload=tasks
  3659. variations:
  3660. - __suffix__: aws
  3661. - name: chaos_many_actors_kill_raylet
  3662. group: core-nightly-test
  3663. working_dir: nightly_tests
  3664. frequency: nightly
  3665. team: core
  3666. cluster:
  3667. byod: {}
  3668. cluster_compute: chaos_test/compute_template.yaml
  3669. run:
  3670. timeout: 4200
  3671. wait_for_nodes:
  3672. num_nodes: 10
  3673. prepare: python setup_chaos.py --no-start
  3674. script: python chaos_test/test_chaos_basic.py --workload=actors
  3675. variations:
  3676. - __suffix__: aws
  3677. - __suffix__: gce
  3678. env: gce
  3679. frequency: manual
  3680. cluster:
  3681. cluster_compute: chaos_test/compute_template_gce.yaml
  3682. - name: chaos_many_actors_terminate_instance
  3683. group: core-nightly-test
  3684. working_dir: nightly_tests
  3685. frequency: nightly
  3686. team: core
  3687. cluster:
  3688. byod: {}
  3689. cluster_compute: chaos_test/compute_template.yaml
  3690. run:
  3691. timeout: 4200
  3692. wait_for_nodes:
  3693. num_nodes: 10
  3694. prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance
  3695. script: python chaos_test/test_chaos_basic.py --workload=actors
  3696. variations:
  3697. - __suffix__: aws
  3698. - name: chaos_dask_on_ray_large_scale_test_no_spilling
  3699. group: data-tests
  3700. working_dir: nightly_tests
  3701. frequency: nightly
  3702. team: data
  3703. cluster:
  3704. byod:
  3705. runtime_env:
  3706. - RAY_lineage_pinning_enabled=1
  3707. cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml
  3708. run:
  3709. timeout: 7200
  3710. wait_for_nodes:
  3711. num_nodes: 21
  3712. prepare: python setup_chaos.py --kill-interval 100
  3713. script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb
  3714. 20 --error_rate 0 --data_save_path /tmp/ray
  3715. variations:
  3716. - __suffix__: aws
  3717. - __suffix__: gce
  3718. env: gce
  3719. frequency: manual
  3720. cluster:
  3721. cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  3722. - name: chaos_dask_on_ray_large_scale_test_spilling
  3723. group: data-tests
  3724. working_dir: nightly_tests
  3725. frequency: nightly
  3726. team: data
  3727. cluster:
  3728. byod:
  3729. runtime_env:
  3730. - RAY_lineage_pinning_enabled=1
  3731. cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
  3732. run:
  3733. timeout: 7200
  3734. wait_for_nodes:
  3735. num_nodes: 21
  3736. prepare: python setup_chaos.py --kill-interval 100
  3737. script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
  3738. 70 --error_rate 0 --data_save_path /tmp/ray
  3739. variations:
  3740. - __suffix__: aws
  3741. - __suffix__: gce
  3742. env: gce
  3743. frequency: manual
  3744. cluster:
  3745. cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml
  3746. - name: chaos_dataset_shuffle_push_based_sort_1tb
  3747. group: data-tests
  3748. working_dir: nightly_tests
  3749. stable: false
  3750. frequency: nightly
  3751. team: data
  3752. cluster:
  3753. byod:
  3754. runtime_env:
  3755. - RAY_worker_killing_policy=retriable_lifo
  3756. pip:
  3757. - ray[default]
  3758. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3759. run:
  3760. timeout: 7200
  3761. prepare: 'python setup_chaos.py --kill-interval 1200 --max-to-kill 3'
  3762. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3763. wait_for_nodes:
  3764. num_nodes: 20
  3765. variations:
  3766. - __suffix__: aws
  3767. - __suffix__: gce
  3768. env: gce
  3769. frequency: manual
  3770. cluster:
  3771. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3772. - name: chaos_dataset_shuffle_sort_1tb
  3773. group: data-tests
  3774. working_dir: nightly_tests
  3775. stable: false
  3776. frequency: nightly
  3777. team: data
  3778. cluster:
  3779. byod:
  3780. runtime_env:
  3781. - RAY_memory_monitor_refresh_ms=0
  3782. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3783. run:
  3784. timeout: 7200
  3785. prepare: 'python setup_chaos.py --kill-interval 900 --max-to-kill 3'
  3786. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9
  3787. wait_for_nodes:
  3788. num_nodes: 20
  3789. variations:
  3790. - __suffix__: aws
  3791. - __suffix__: gce
  3792. env: gce
  3793. frequency: manual
  3794. cluster:
  3795. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3796. - name: chaos_dataset_shuffle_random_shuffle_1tb
  3797. group: data-tests
  3798. working_dir: nightly_tests
  3799. stable: false
  3800. frequency: nightly
  3801. team: data
  3802. cluster:
  3803. # Leave OOM prevention (the memory monitor) disabled, as this test is marked unstable at the moment.
  3804. byod:
  3805. runtime_env:
  3806. - RAY_memory_monitor_refresh_ms=0
  3807. pip:
  3808. - ray[default]
  3809. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3810. run:
  3811. timeout: 7200
  3812. prepare: ' python setup_chaos.py --kill-interval 600 --max-to-kill 2'
  3813. script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3814. wait_for_nodes:
  3815. num_nodes: 20
  3816. variations:
  3817. - __suffix__: aws
  3818. - __suffix__: gce
  3819. env: gce
  3820. frequency: manual
  3821. cluster:
  3822. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3823. - name: chaos_dataset_shuffle_push_based_random_shuffle_1tb
  3824. group: data-tests
  3825. working_dir: nightly_tests
  3826. stable: false
  3827. frequency: nightly
  3828. team: data
  3829. cluster:
  3830. # Leave OOM prevention (the memory monitor) disabled, as this test is marked unstable at the moment.
  3831. byod:
  3832. runtime_env:
  3833. - RAY_memory_monitor_refresh_ms=0
  3834. cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
  3835. run:
  3836. timeout: 7200
  3837. prepare: ' python setup_chaos.py --kill-interval 600 --max-to-kill 2'
  3838. script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
  3839. wait_for_nodes:
  3840. num_nodes: 20
  3841. variations:
  3842. - __suffix__: aws
  3843. - __suffix__: gce
  3844. env: gce
  3845. frequency: manual
  3846. cluster:
  3847. cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml
  3848. #####################
  3849. # Observability tests
  3850. #####################
  3851. - name: agent_stress_test
  3852. group: core-observability-test
  3853. working_dir: dashboard
  3854. frequency: nightly
  3855. team: core
  3856. cluster:
  3857. byod:
  3858. type: gpu
  3859. runtime_env:
  3860. - RAY_INTERNAL_MEM_PROFILE_COMPONENTS=dashboard_agent
  3861. post_build_script: byod_agent_stress_test.sh
  3862. cluster_compute: agent_stress_compute.yaml
  3863. run:
  3864. timeout: 14400
  3865. script: python mem_check.py --working-dir .
  3866. variations:
  3867. - __suffix__: aws
  3868. - __suffix__: gce
  3869. env: gce
  3870. frequency: manual
  3871. cluster:
  3872. cluster_compute: agent_stress_compute_gce.yaml
  3873. - name: k8s_serve_ha_test
  3874. group: k8s-test
  3875. working_dir: k8s_tests
  3876. stable: false
  3877. frequency: nightly
  3878. team: serve
  3879. cluster:
  3880. byod: {}
  3881. cluster_compute: compute_tpl.yaml
  3882. run:
  3883. timeout: 28800 # 8h
  3884. prepare: bash prepare.sh
  3885. script: python run_gcs_ft_on_k8s.py
  3886. - name: aws_cluster_launcher
  3887. group: cluster-launcher-test
  3888. working_dir: ../python/ray/autoscaler/
  3889. frequency: nightly
  3890. team: clusters
  3891. cluster:
  3892. byod: {}
  3893. cluster_compute: aws/tests/aws_compute.yaml
  3894. run:
  3895. timeout: 2400
  3896. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10
  3897. - name: aws_cluster_launcher_nightly_image
  3898. group: cluster-launcher-test
  3899. working_dir: ../python/ray/autoscaler/
  3900. frequency: nightly
  3901. team: clusters
  3902. cluster:
  3903. byod: {}
  3904. cluster_compute: aws/tests/aws_compute.yaml
  3905. run:
  3906. timeout: 2400
  3907. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override nightly
  3908. - name: aws_cluster_launcher_latest_image
  3909. group: cluster-launcher-test
  3910. working_dir: ../python/ray/autoscaler/
  3911. frequency: nightly
  3912. team: clusters
  3913. cluster:
  3914. byod: {}
  3915. cluster_compute: aws/tests/aws_compute.yaml
  3916. run:
  3917. timeout: 2400
  3918. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override latest
  3919. - name: aws_cluster_launcher_release_image
  3920. group: cluster-launcher-test
  3921. working_dir: ../python/ray/autoscaler/
  3922. frequency: manual
  3923. team: clusters
  3924. cluster:
  3925. byod: {}
  3926. cluster_compute: aws/tests/aws_compute.yaml
  3927. run:
  3928. timeout: 2400
  3929. script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override commit
  3930. - name: aws_cluster_launcher_minimal
  3931. group: cluster-launcher-test
  3932. working_dir: ../python/ray/autoscaler/
  3933. frequency: nightly
  3934. team: clusters
  3935. cluster:
  3936. byod: {}
  3937. cluster_compute: aws/tests/aws_compute.yaml
  3938. run:
  3939. timeout: 1200
  3940. script: python launch_and_verify_cluster.py aws/example-minimal.yaml
  3941. - name: aws_cluster_launcher_full
  3942. group: cluster-launcher-test
  3943. working_dir: ../python/ray/autoscaler/
  3944. frequency: nightly
  3945. team: clusters
  3946. cluster:
  3947. byod: {}
  3948. cluster_compute: aws/tests/aws_compute.yaml
  3949. run:
  3950. timeout: 3000
  3951. script: python launch_and_verify_cluster.py aws/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest
  3952. - name: gcp_cluster_launcher_minimal
  3953. group: cluster-launcher-test
  3954. working_dir: ../python/ray/autoscaler/
  3955. stable: true
  3956. env: gce
  3957. frequency: nightly
  3958. team: clusters
  3959. cluster:
  3960. byod: {}
  3961. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3962. run:
  3963. timeout: 1200
  3964. script: python launch_and_verify_cluster.py gcp/example-minimal.yaml
  3965. - name: gcp_cluster_launcher_full
  3966. group: cluster-launcher-test
  3967. working_dir: ../python/ray/autoscaler/
  3968. stable: true
  3969. env: gce
  3970. frequency: nightly
  3971. team: clusters
  3972. cluster:
  3973. byod: {}
  3974. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3975. run:
  3976. timeout: 4800
  3977. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 30
  3978. - name: gcp_cluster_launcher_latest_image
  3979. group: cluster-launcher-test
  3980. working_dir: ../python/ray/autoscaler/
  3981. stable: true
  3982. env: gce
  3983. frequency: nightly
  3984. team: clusters
  3985. cluster:
  3986. byod: {}
  3987. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  3988. run:
  3989. timeout: 3600
  3990. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest
  3991. - name: gcp_cluster_launcher_nightly_image
  3992. group: cluster-launcher-test
  3993. working_dir: ../python/ray/autoscaler/
  3994. stable: true
  3995. env: gce
  3996. frequency: nightly
  3997. team: clusters
  3998. cluster:
  3999. byod: {}
  4000. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  4001. run:
  4002. timeout: 3600
  4003. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly
  4004. - name: gcp_cluster_launcher_release_image
  4005. group: cluster-launcher-test
  4006. working_dir: ../python/ray/autoscaler/
  4007. stable: true
  4008. env: gce
  4009. frequency: manual
  4010. team: clusters
  4011. cluster:
  4012. byod: {}
  4013. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  4014. run:
  4015. timeout: 3600
  4016. script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override commit
  4017. - name: gcp_cluster_launcher_gpu_docker
  4018. group: cluster-launcher-test
  4019. working_dir: ../python/ray/autoscaler/
  4020. stable: true
  4021. env: gce
  4022. frequency: weekly
  4023. team: clusters
  4024. cluster:
  4025. byod: {}
  4026. cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
  4027. run:
  4028. timeout: 1200
  4029. script: python launch_and_verify_cluster.py gcp/example-gpu-docker.yaml
  4030. - name: autoscaler_aws
  4031. group: autoscaler-test
  4032. working_dir: autoscaling_tests
  4033. stable: False
  4034. frequency: nightly
  4035. team: core
  4036. cluster:
  4037. # Leave OOM prevention (the memory monitor) disabled, as this test is marked unstable at the moment.
  4038. byod:
  4039. runtime_env:
  4040. - RAY_memory_monitor_refresh_ms=0
  4041. pip:
  4042. - ray[default]
  4043. cluster_compute: aws.yaml
  4044. run:
  4045. timeout: 1800
  4046. script: python run.py