# rllib_contrib_learning_tests.yaml
# RLlib learning test definitions, grouped by algorithm.

  # --------------------------
  # A2C
  # --------------------------
  # A2C learning tests, tf variant: `run.script` points run.py at the `a2c`
  # YAML sub-directory with --framework=tf.
  - name: rllib_learning_tests_a2c_tf
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a2c --framework=tf
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml
  # A2C learning tests, torch variant: `run.script` points run.py at the `a2c`
  # YAML sub-directory with --framework=torch.
  - name: rllib_learning_tests_a2c_torch
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a2c --framework=torch
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml
  # --------------------------
  # A3C
  # --------------------------
  # A3C learning tests, tf variant: `run.script` points run.py at the `a3c`
  # YAML sub-directory with --framework=tf.
  - name: rllib_learning_tests_a3c_tf
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        # NOTE(review): byod type is gpu while cluster_compute below is
        # 32cpus.yaml — confirm this pairing is intended.
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 32cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=a3c --framework=tf
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 32cpus_gce.yaml
  # --------------------------
  # APEX-DQN
  # --------------------------
  # APEX-DQN learning tests, tf variant: `run.script` points run.py at the
  # `apex` YAML sub-directory with --framework=tf.
  - name: rllib_learning_tests_apex_dqn_tf
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_24cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=apex --framework=tf
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_24cpus_gce.yaml
  # APEX-DQN learning tests, torch variant: `run.script` points run.py at the
  # `apex` YAML sub-directory with --framework=torch.
  - name: rllib_learning_tests_apex_dqn_torch
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_24cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=apex --framework=torch
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_24cpus_gce.yaml
  # --------------------------
  # DDPG
  # --------------------------
  # DDPG learning tests, tf variant: `run.script` points run.py at the `ddpg`
  # YAML sub-directory with --framework=tf.
  - name: rllib_learning_tests_ddpg_tf
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    stable: false
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=tf
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml
  # DDPG learning tests, torch variant: `run.script` points run.py at the
  # `ddpg` YAML sub-directory with --framework=torch.
  - name: rllib_learning_tests_ddpg_torch
    group: RLlib tests
    working_dir: rllib_tests
    frequency: nightly
    stable: false
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=torch
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml
  # --------------------------
  # ES
  # --------------------------
  # ES learning tests, tf variant: `run.script` points run.py at the `es`
  # YAML sub-directory with --framework=tf.
  - name: rllib_learning_tests_es_tf
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 2gpus_64cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=es --framework=tf
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 2gpus_64cpus_gce.yaml
  # ES learning tests, torch variant: `run.script` points run.py at the `es`
  # YAML sub-directory with --framework=torch.
  - name: rllib_learning_tests_es_torch
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 2gpus_64cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=es --framework=torch
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 2gpus_64cpus_gce.yaml
  # --------------------------
  # SlateQ
  # --------------------------
  # SlateQ learning tests, tf variant: `run.script` points run.py at the
  # `slateq` YAML sub-directory with --framework=tf.
  - name: rllib_learning_tests_slateq_tf
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=slateq --framework=tf
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml
  # SlateQ learning tests, torch variant: `run.script` points run.py at the
  # `slateq` YAML sub-directory with --framework=torch.
  - name: rllib_learning_tests_slateq_torch
    group: RLlib tests
    working_dir: rllib_tests
    # Marking as unstable since it's currently expected to fail.
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=slateq --framework=torch
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml
  # --------------------------
  # TD3
  # --------------------------
  # TD3 learning tests, tf variant: `run.script` points run.py at the `td3`
  # YAML sub-directory with --framework=tf.
  - name: rllib_learning_tests_td3_tf
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=td3 --framework=tf
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml
  # TD3 learning tests, torch variant: `run.script` points run.py at the `td3`
  # YAML sub-directory with --framework=torch.
  - name: rllib_learning_tests_td3_torch
    group: RLlib tests
    working_dir: rllib_tests
    stable: false
    frequency: nightly
    team: rllib
    cluster:
      byod:
        type: gpu
        post_build_script: byod_rllib_test.sh
        # KEY=VALUE entries; the second appends the mujoco210 bin directory
        # to LD_LIBRARY_PATH.
        runtime_env:
          - RLLIB_TEST_NO_JAX_IMPORT=1
          - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
      cluster_compute: 1gpu_16cpus.yaml
    run:
      # presumably seconds (18000 s = 5 h) — confirm against the test runner
      timeout: 18000
      script: python learning_tests/run.py --yaml-sub-dir=td3 --framework=torch
    alert: default
    # One variation per __suffix__; the gce one carries its own env,
    # frequency and cluster_compute values.
    variations:
      - __suffix__: aws
      - __suffix__: gce
        env: gce
        frequency: manual
        cluster:
          cluster_compute: 1gpu_16cpus_gce.yaml