# job.yaml - Helm template for the Horovod driver Job
  1. ---
  2. apiVersion: batch/v1
  3. kind: Job
  4. metadata:
  5. name: {{ template "horovod.fullname" . }}
  6. labels:
  7. app: {{ template "horovod.name" . }}
  8. chart: {{ template "horovod.chart" . }}
  9. release: {{ .Release.Name }}
  10. heritage: {{ .Release.Service }}
  11. role: driver
  12. spec:
  13. template:
  14. metadata:
  15. labels:
  16. app: {{ template "horovod.name" . }}
  17. release: {{ .Release.Name }}
  18. role: driver
  19. spec:
  20. {{- if .Values.useHostNetwork }}
  21. hostNetwork: {{ .Values.useHostNetwork }}
  22. dnsPolicy: ClusterFirstWithHostNet
  23. {{- end }}
  24. {{- if .Values.useHostPID }}
  25. hostPID: {{ .Values.useHostPID }}
  26. {{- end }}
  27. restartPolicy: OnFailure
  28. volumes:
  29. - name: {{ template "horovod.fullname" . }}-cm
  30. configMap:
  31. name: {{ template "horovod.fullname" . }}
  32. items:
  33. - key: hostfile.config
  34. path: hostfile
  35. mode: 438
  36. - key: driver.waitWorkerReady
  37. path: waitWorkersReady.sh
  38. mode: 365
  39. - key: driver.run
  40. path: run.sh
  41. mode: 365
  42. {{- if .Values.ssh.useSecrets }}
  43. - name: {{ template "horovod.fullname" . }}-secret
  44. secret:
  45. secretName: {{ template "horovod.fullname" . }}
  46. defaultMode: 448
  47. items:
  48. - key: host-key
  49. path: id_rsa
  50. - key: host-key-pub
  51. path: authorized_keys
  52. {{- end }}
  53. {{- if .Values.volumes }}
  54. {{ toYaml .Values.volumes | indent 6 }}
  55. {{- end }}
  56. containers:
  57. - name: horovod-driver
  58. image: "{{ .Values.driver.image.repository }}:{{ .Values.driver.image.tag }}"
  59. imagePullPolicy: {{ .Values.driver.image.pullPolicy }}
  60. env:
  61. - name: SSHPORT
  62. value: "{{ .Values.ssh.port }}"
  63. {{- if .Values.ssh.useSecrets }}
  64. - name: USESECRETS
  65. value: "{{ .Values.ssh.useSecrets }}"
  66. {{- end }}
  67. {{- if .Values.driver.env }}
  68. {{- range $key, $value := .Values.driver.env }}
  69. - name: "{{ $key }}"
  70. value: "{{ $value }}"
  71. {{- end }}
  72. {{- end }}
  73. {{- if .Values.driver.privileged }}
  74. securityContext:
  75. privileged: true
  76. {{- end }}
  77. ports:
  78. - containerPort: {{ .Values.ssh.port }}
  79. volumeMounts:
  80. - name: {{ template "horovod.fullname" . }}-cm
  81. mountPath: /horovod/generated
  82. {{- if .Values.ssh.useSecrets }}
  83. - name: {{ template "horovod.fullname" . }}-secret
  84. readOnly: true
  85. mountPath: "/etc/secret-volume"
  86. {{- end }}
  87. {{- if .Values.volumeMounts }}
  88. {{ toYaml .Values.volumeMounts | indent 8 }}
  89. {{- end }}
  90. command:
  91. - /horovod/generated/run.sh
  92. args:
  93. {{ toYaml .Values.driver.args | indent 10 }}
  94. resources:
  95. {{ toYaml .Values.resources | indent 10 }}
  96. {{- if .Values.ssh.useSecrets }}
  97. initContainers:
  98. - name: wait-workers
  99. image: "{{ .Values.driver.image.repository }}:{{ .Values.driver.image.tag }}"
  100. imagePullPolicy: {{ .Values.driver.image.pullPolicy }}
  101. env:
  102. - name: SSHPORT
  103. value: "{{ .Values.ssh.port }}"
  104. {{- if .Values.ssh.useSecrets }}
  105. - name: USESECRETS
  106. value: "{{ .Values.ssh.useSecrets }}"
  107. {{- end }}
  108. {{- if .Values.driver.env }}
  109. {{- range $key, $value := .Values.driver.env }}
  110. - name: "{{ $key }}"
  111. value: "{{ $value }}"
  112. {{- end }}
  113. {{- end }}
  114. command:
  115. - /horovod/generated/waitWorkersReady.sh
  116. args:
  117. - /horovod/generated/hostfile
  118. volumeMounts:
  119. - name: {{ template "horovod.fullname" . }}-cm
  120. mountPath: /horovod/generated
  121. {{- if .Values.ssh.useSecrets }}
  122. - name: {{ template "horovod.fullname" . }}-secret
  123. readOnly: true
  124. mountPath: "/etc/secret-volume"
  125. {{- end }}
  126. {{- end }}