# docker-compose.test.yml
  1. version: '2.3'
  2. services:
  # Shared definition for the CPU test images: the test-cpu-* services extend
  # this one and override individual build args.
  test-cpu-base:
    build:
      context: .
      dockerfile: Dockerfile.test.cpu
      args:
        # Numeric-looking values are quoted so YAML keeps them as strings
        # instead of resolving them to floats/ints.
        UBUNTU_VERSION: '20.04'
        GPP_VERSION: '7'
        # the literal string "None", not a YAML null
        MPI_KIND: 'None'
        PYTHON_VERSION: '3.8'
        TENSORFLOW_PACKAGE: tensorflow-cpu==2.12.0
        KERAS_PACKAGE: keras==2.12.0
        PYTORCH_PACKAGE: torch==2.0.0+cpu
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        TORCHVISION_PACKAGE: torchvision==0.15.1+cpu
        MXNET_PACKAGE: mxnet==1.9.1
        PYSPARK_PACKAGE: pyspark==3.4.0
        SPARK_PACKAGE: spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz
        HOROVOD_BUILD_FLAGS: HOROVOD_WITH_GLOO=1
    privileged: true
    shm_size: 8gb
  # our baseline first: test-cpu-base with no build args overridden
  test-cpu-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0:
    extends: test-cpu-base
  # permute MPI kinds
  # MPICH variant: overrides MPI_KIND and replaces the base's
  # HOROVOD_WITH_GLOO=1 build flag with HOROVOD_WITHOUT_GLOO=1.
  test-cpu-mpich-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        MPI_KIND: MPICH
        HOROVOD_BUILD_FLAGS: HOROVOD_WITHOUT_GLOO=1
  # oneCCL variant: overrides MPI_KIND and builds with HOROVOD_WITHOUT_GLOO=1.
  test-cpu-oneccl-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        MPI_KIND: ONECCL
        HOROVOD_BUILD_FLAGS: HOROVOD_WITHOUT_GLOO=1
  # OpenMPI variant: overrides MPI_KIND and builds with HOROVOD_WITHOUT_GLOO=1.
  test-cpu-openmpi-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        MPI_KIND: OpenMPI
        HOROVOD_BUILD_FLAGS: HOROVOD_WITHOUT_GLOO=1
  # OpenMPI + Gloo variant: only MPI_KIND is overridden, so the base's
  # HOROVOD_WITH_GLOO=1 build flag stays in effect.
  test-cpu-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        MPI_KIND: OpenMPI
  # run_gloo_integration expects tf1 to have Gloo mpi kind to run 'Elastic Spark * Tests'
  # Tensorflow 1.15.5 is only available for Python 3.7
  # Python 3.7 is only available on Ubuntu 18.04
  # torch==1.8.1 is the latest we can test in this setup
  # there is no mxnet-1.6.0.post0 and mxnet-1.6.0 does not work with horovod
  # https://github.com/apache/incubator-mxnet/issues/16193
  # so we test with mxnet 1.5.1
  test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_8_1-mxnet1_5_1_p0-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        # On Ubuntu 18.04 our setup.py will pull in a recent CMake and use that only to build Horovod
        # (version numbers are quoted so YAML keeps them as strings)
        UBUNTU_VERSION: '18.04'
        PYTHON_VERSION: '3.7'
        # there is no tensorflow-cpu>1.15.0, so we use tensorflow==1.15.5
        TENSORFLOW_PACKAGE: tensorflow==1.15.5
        KERAS_PACKAGE: keras==2.2.4
        PYTORCH_PACKAGE: torch==1.8.1+cpu
        TORCHVISION_PACKAGE: torchvision==0.9.1+cpu
        MXNET_PACKAGE: mxnet==1.5.1.post0
  # Older framework set on the baseline image: only the framework package
  # pins are overridden.
  test-cpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_7_0_p2-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        TENSORFLOW_PACKAGE: tensorflow-cpu==2.10.1
        KERAS_PACKAGE: keras==2.10.0
        PYTORCH_PACKAGE: torch==1.12.1+cpu
        TORCHVISION_PACKAGE: torchvision==0.13.1+cpu
        MXNET_PACKAGE: mxnet==1.7.0.post2
  # Older framework set on the baseline image: only the framework package
  # pins are overridden.
  test-cpu-gloo-py3_8-tf2_11_1-keras2_11_0-torch1_13_1-mxnet1_8_0_p0-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        TENSORFLOW_PACKAGE: tensorflow-cpu==2.11.1
        KERAS_PACKAGE: keras==2.11.0
        PYTORCH_PACKAGE: torch==1.13.1+cpu
        TORCHVISION_PACKAGE: torchvision==0.14.1+cpu
        MXNET_PACKAGE: mxnet==1.8.0.post0
  # then our baseline again, omitted ...
  # Framework "head" build: nightly TensorFlow, PyTorch and MXNet packages,
  # with OpenMPI added on top of the base's Gloo build flag.
  test-cpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_4_0:
    extends: test-cpu-base
    build:
      args:
        MPI_KIND: OpenMPI
        TENSORFLOW_PACKAGE: tf-nightly
        # the literal string "None", not a YAML null
        KERAS_PACKAGE: 'None'
        PYTORCH_PACKAGE: torch-nightly
        TORCHVISION_PACKAGE: torchvision
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        MXNET_PACKAGE: mxnet-nightly
  # these are the lowest framework versions that Horovod compiles with, but they are not tested
  test-cpu-openmpi-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:
    extends: test-cpu-base
    build:
      args:
        # version numbers are quoted so YAML keeps them as strings
        UBUNTU_VERSION: '18.04'
        PYTHON_VERSION: '3.7'
        MPI_KIND: OpenMPI
        TENSORFLOW_PACKAGE: tensorflow-cpu==1.15.0
        KERAS_PACKAGE: keras==2.2.4
        PYTORCH_PACKAGE: torch==1.5.0+cpu
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==0.7.3
        TORCHVISION_PACKAGE: torchvision==0.6.0+cpu
        MXNET_PACKAGE: mxnet==1.4.1
        PYSPARK_PACKAGE: pyspark==2.4.0
        SPARK_PACKAGE: spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
  # we deviate from baseline here because PySpark 2.4 requires Python 3.7 and
  # Tensorflow 2.11.0 is the last version that supports that Python
  # Torch 1.13.1 is the last version that supports that Python
  test-cpu-gloo-py3_7-tf2_11_0-keras2_11_0-torch1_13_1-mxnet1_9_1-pyspark2_4_8:
    extends: test-cpu-base
    build:
      args:
        # PySpark 2.4.8 is only available for Python 3.7
        # Python 3.7 is only available on Ubuntu 18.04
        # Tensorflow 2.11.0 is the last version supporting that Python
        # Torch 1.13.1 is the last version supporting that Python
        # (version numbers are quoted so YAML keeps them as strings)
        UBUNTU_VERSION: '18.04'
        PYTHON_VERSION: '3.7'
        TENSORFLOW_PACKAGE: tensorflow-cpu==2.11.0
        KERAS_PACKAGE: keras==2.11.0
        PYTORCH_PACKAGE: torch==1.13.1+cpu
        TORCHVISION_PACKAGE: torchvision==0.14.1+cpu
        PYSPARK_PACKAGE: pyspark==2.4.8
        SPARK_PACKAGE: spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
  # Baseline frameworks with PySpark/Spark 3.3.2 in place of 3.4.0.
  test-cpu-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_3_2:
    extends: test-cpu-base
    build:
      args:
        # quoted so YAML keeps the version as a string
        PYTHON_VERSION: '3.8'
        PYSPARK_PACKAGE: pyspark==3.3.2
        SPARK_PACKAGE: spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz
  # then our baseline again, omitted ...
  # Shared definition for the GPU test images: the test-gpu-* and test-mixed-*
  # services extend this one and supply the CUDA and framework build args.
  test-gpu-base:
    build:
      context: .
      dockerfile: Dockerfile.test.gpu
      args:
        # Numeric-looking values are quoted so YAML keeps them as strings
        # instead of resolving them to floats/ints.
        GPP_VERSION: '7'
        # the literal string "None", not a YAML null
        MPI_KIND: 'None'
        PYTHON_VERSION: '3.8'
        PYSPARK_PACKAGE: pyspark==3.4.0
        SPARK_PACKAGE: spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz
        HOROVOD_BUILD_FLAGS: HOROVOD_GPU_OPERATIONS=NCCL
        HOROVOD_MIXED_INSTALL: '0'
    runtime: nvidia
    # We plumb CUDA_VISIBLE_DEVICES instead of NVIDIA_VISIBLE_DEVICES because
    # the latter does not work in privileged mode that we use in the containers.
    environment:
      - CUDA_VISIBLE_DEVICES
    privileged: true
    shm_size: 8gb
  # available versions for CUDNN_VERSION and NCCL_VERSION_OVERRIDE can be found at
  # https://developer.download.nvidia.com/compute/cuda/repos/{OS}/x86_64/
  # Mainline tensorflow-gpu==1.15.5 is compiled against and linked to CUDA 10.0, but appropriate containers aren't
  # available anymore. Hence, we use the updated Python 3.8 wheel provided by Nvidia, see
  # https://github.com/NVIDIA/tensorflow. For this reason versions of torch and mxnet also deviate from the CPU path.
  test-gpu-gloo-py3_8-tf1_15_5-keras2_2_4-torch1_12_1-mxnet1_8_0_p0-pyspark3_4_0:
    extends: test-gpu-base
    build:
      args:
        CUDA_DOCKER_VERSION: 11.6.2-devel-ubuntu20.04
        CUDNN_VERSION: 8.4.1.50-1+cuda11.6
        NCCL_VERSION_OVERRIDE: 2.11.4-1+cuda11.6
        # quoted so YAML keeps the version as a string
        PYTHON_VERSION: '3.8'
        TENSORFLOW_PACKAGE: nvidia-tensorflow==1.15.5+nv22.4
        KERAS_PACKAGE: keras==2.2.4
        PYTORCH_PACKAGE: torch==1.12.1+cu116
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        TORCHVISION_PACKAGE: torchvision==0.13.1+cu116
        MXNET_PACKAGE: mxnet-cu112==1.8.0.post0
  # The container isn't provided for CUDA 10 anymore. The lowest version of mxnet available for cu112 is 1.8.0.post0.
  test-gpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_8_0_p0-pyspark3_4_0:
    extends: test-gpu-base
    build:
      args:
        CUDA_DOCKER_VERSION: 11.6.2-devel-ubuntu20.04
        CUDNN_VERSION: 8.4.1.50-1+cuda11.6
        NCCL_VERSION_OVERRIDE: 2.11.4-1+cuda11.6
        TENSORFLOW_PACKAGE: tensorflow-gpu==2.10.1
        KERAS_PACKAGE: keras==2.10.0
        PYTORCH_PACKAGE: torch==1.12.1+cu116
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        TORCHVISION_PACKAGE: torchvision==0.13.1+cu116
        MXNET_PACKAGE: mxnet-cu112==1.8.0.post0
  # GPU build against the CUDA 11.6 image with TensorFlow 2.11.1 and Torch 1.13.1.
  test-gpu-gloo-py3_8-tf2_11_1-keras2_11_0-torch1_13_1-mxnet1_8_0_p0-pyspark3_4_0:
    extends: test-gpu-base
    build:
      args:
        CUDA_DOCKER_VERSION: 11.6.2-devel-ubuntu20.04
        CUDNN_VERSION: 8.4.1.50-1+cuda11.6
        NCCL_VERSION_OVERRIDE: 2.11.4-1+cuda11.6
        # tensorflow package supports GPU from 2.11.1 and 2.12.0 on
        TENSORFLOW_PACKAGE: tensorflow==2.11.1
        KERAS_PACKAGE: keras==2.11.0
        PYTORCH_PACKAGE: torch==1.13.1+cu116
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        TORCHVISION_PACKAGE: torchvision==0.14.1+cu116
        MXNET_PACKAGE: mxnet-cu112==1.8.0.post0
  # GPU build against the CUDA 11.8 image with OpenMPI, TensorFlow 2.12.0 and
  # Torch 2.0.0.
  test-gpu-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0:
    extends: test-gpu-base
    build:
      args:
        CUDA_DOCKER_VERSION: 11.8.0-devel-ubuntu20.04
        CUDNN_VERSION: 8.6.0.163-1+cuda11.8
        NCCL_VERSION_OVERRIDE: 2.16.5-1+cuda11.8
        MPI_KIND: OpenMPI
        # tensorflow package supports GPU from 2.11.1 and 2.12.0 on
        TENSORFLOW_PACKAGE: tensorflow==2.12.0
        KERAS_PACKAGE: keras==2.12.0
        PYTORCH_PACKAGE: torch==2.0.0+cu118
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        TORCHVISION_PACKAGE: torchvision==0.15.1+cu118
        MXNET_PACKAGE: mxnet-cu112==1.9.1
  # Framework "head" GPU build: nightly TensorFlow, PyTorch and MXNet packages
  # against the CUDA 11.8 image, with OpenMPI.
  test-gpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_4_0:
    extends: test-gpu-base
    build:
      args:
        CUDA_DOCKER_VERSION: 11.8.0-devel-ubuntu20.04
        CUDNN_VERSION: 8.6.0.163-1+cuda11.8
        NCCL_VERSION_OVERRIDE: 2.16.5-1+cuda11.8
        MPI_KIND: OpenMPI
        TENSORFLOW_PACKAGE: tf-nightly
        # the literal string "None", not a YAML null
        KERAS_PACKAGE: 'None'
        PYTORCH_PACKAGE: torch-nightly-cu118
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        TORCHVISION_PACKAGE: torchvision
        MXNET_PACKAGE: mxnet-nightly-cu112
  # These are the lowest framework versions that Horovod compiles with on the CUDA 11.x container, but they are not tested.
  # Versions of python, mxnet, and pyspark differ from the CPU build with minimum versions.
  test-gpu-openmpi-gloo-py3_8-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:
    extends: test-gpu-base
    build:
      args:
        CUDA_DOCKER_VERSION: 11.6.2-devel-ubuntu20.04
        CUDNN_VERSION: 8.4.1.50-1+cuda11.6
        NCCL_VERSION_OVERRIDE: 2.11.4-1+cuda11.6
        MPI_KIND: OpenMPI
        # quoted so YAML keeps the version as a string
        PYTHON_VERSION: '3.8'
        TENSORFLOW_PACKAGE: nvidia-tensorflow==1.15.5+nv22.4
        KERAS_PACKAGE: keras==2.2.4
        # torch ships its own CUDA libraries
        PYTORCH_PACKAGE: torch==1.5.0+cu101
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==0.7.3
        TORCHVISION_PACKAGE: torchvision==0.6.0+cu101
        MXNET_PACKAGE: mxnet-cu112==1.8.0.post0
        # On Python 3.8 Spark 3.0.0 is the lowest supported version
        PYSPARK_PACKAGE: pyspark==3.0.0
        SPARK_PACKAGE: spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
  # Mixed-install variant of the CUDA 11.8 / OpenMPI build: it clears the
  # base's HOROVOD_BUILD_FLAGS and sets HOROVOD_MIXED_INSTALL to 1.
  test-mixed-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0:
    extends: test-gpu-base
    build:
      args:
        CUDA_DOCKER_VERSION: 11.8.0-devel-ubuntu20.04
        CUDNN_VERSION: 8.6.0.163-1+cuda11.8
        NCCL_VERSION_OVERRIDE: 2.16.5-1+cuda11.8
        MPI_KIND: OpenMPI
        # tensorflow package supports GPU from 2.11.1 and 2.12.0 on
        TENSORFLOW_PACKAGE: tensorflow==2.12.0
        KERAS_PACKAGE: keras==2.12.0
        PYTORCH_PACKAGE: torch==2.0.0+cu118
        PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.5.9
        TORCHVISION_PACKAGE: torchvision==0.15.1+cu118
        MXNET_PACKAGE: mxnet-cu112==1.9.1
        # explicit empty string so the base's HOROVOD_GPU_OPERATIONS=NCCL is overridden
        HOROVOD_BUILD_FLAGS: ""
        # quoted so YAML keeps the value as a string
        HOROVOD_MIXED_INSTALL: '1'