{{- /*
Template-wide variables.
  $workerNum  number of worker hosts listed in the hostfile (.Values.worker.number)
  $name       fully qualified release name, used as the host name prefix
  $slots      process slots per host: the "nvidia.com/gpu" entry of
              .Values.resources when it is set and non-zero, otherwise 1
$slots is declared exactly once. Re-declaring it with ":=" inside an "if"
creates a new variable that goes out of scope at the matching "end", which
would leave the outer value stuck at 1.
*/ -}}
{{- $workerNum := .Values.worker.number -}}
{{- $name := include "horovod.fullname" . }}
{{- $slots := default 1 (index .Values.resources "nvidia.com/gpu") }}
# ConfigMap holding the hostfile and the SSH helper scripts defined under `data`.
apiVersion: v1
kind: ConfigMap
metadata:
  # $name is the rendered "horovod.fullname" helper, declared at the top of the file.
  name: {{ $name }}
  labels:
    app: {{ $name }}
    chart: {{ include "horovod.chart" . }}
    release: {{ .Release.Name | quote }}
    heritage: {{ .Release.Service | quote }}
data:
  # Hostfile: one "<host> slots=<n>" line for the driver, then one per worker.
  hostfile.config: |
    {{ $name }}-driver slots={{ $slots }}
    {{- range $i, $none := until (int $workerNum) }}
    {{ $name }}-{{ $i }}.{{ $name }} slots={{ $slots }}
    {{- end }}
  # Readiness check: fails (errexit) until an SSH login to localhost works.
  ssh.readiness: |
    #!/bin/bash
    set -xev
    ssh localhost ls
  # Driver entry point: prepares root's SSH setup, starts sshd in the
  # background, then runs the command passed as arguments (or idles).
  driver.run: |
    #!/bin/bash
    set -x
    # Fixed 5 second startup delay.
    sleep 5
    # Start from an empty SSH client config.
    mkdir -p /root/.ssh
    rm -f /root/.ssh/config
    touch /root/.ssh/config
    if [[ "$USESECRETS" == "true" ]]; then
      # Copy the key material best-effort: a failed copy must not abort.
      # NOTE: the closing `set -e` leaves errexit enabled for the rest of
      # this script whenever this branch runs.
      set +e
      yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
      yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
      set -e
    fi
    if [ -n "$SSHPORT" ]; then
      # Use the same port for the SSH client and for the local sshd.
      echo "Port $SSHPORT" > /root/.ssh/config
      sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
    fi
    echo "StrictHostKeyChecking no" >> /root/.ssh/config
    /usr/sbin/sshd
    if [ $# -eq 0 ]; then
      sleep infinity
    else
      bash -c "$*"
    fi
  # Blocks until every non-driver host in the hostfile ($1) accepts SSH.
  driver.waitWorkerReady: |
    #!/bin/bash
    set -xev

    # Recreate root's SSH client config: optional custom port, and host key
    # checking disabled.
    function updateSSHPort() {
      mkdir -p /root/.ssh
      rm -f /root/.ssh/config
      touch /root/.ssh/config
      if [ -n "$SSHPORT" ]; then
        echo "Port $SSHPORT" > /root/.ssh/config
      fi
      # Written unconditionally, as driver.run and worker.run do, so the
      # non-interactive ssh calls below never stop on an unknown host key.
      echo "StrictHostKeyChecking no" >> /root/.ssh/config
    }

    # retry <retry_number> <command...>
    # Runs the command, retrying up to <retry_number> more times with a one
    # second pause after each failure. Returns as soon as the command
    # succeeds; exits the script with status 1 on misuse or once every
    # attempt has failed.
    function retry() {
      if [[ $# -le 1 ]]; then
        echo "Usage $0 <retry_number> <Command>"
        exit 1
      fi
      local try=$1
      shift
      local n=0
      # A command used as the `until` condition is exempt from errexit, and
      # "$@" keeps every argument intact instead of re-splitting a string.
      until "$@"; do
        if [[ $n -ge $try ]]; then
          exit 1
        fi
        echo "Command Fail.."
        n=$((n + 1))
        echo "retry $n :: [$*]"
        sleep 1
      done
    }

    # runCheckSSH <hostfile>
    # Takes the first column of every hostfile line as a host name and waits
    # for SSH on each host whose name does not end in "driver".
    function runCheckSSH() {
      local hostfile=$1
      if [[ ! -r "$hostfile" ]]; then
        echo "Usage $0 <hostfile>"
        exit 1
      fi
      if [[ "$USESECRETS" == "true" ]]; then
        # Copy the key material best-effort: a failed copy must not abort.
        set +e
        yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
        yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
        set -e
      fi
      local host
      for host in $(awk '{print $1}' "$hostfile"); do
        if [[ "$host" != *"driver" ]]; then
          retry 30 ssh -o ConnectTimeout=2 -q "$host" exit
        fi
      done
    }

    updateSSHPort
    runCheckSSH "$1"
  # Worker entry point: prepares root's SSH setup, then runs sshd in the
  # foreground (-D) so the script lasts as long as sshd does.
  worker.run: |
    #!/bin/bash
    set -x
    # Start from an empty SSH client config.
    mkdir -p /root/.ssh
    rm -f /root/.ssh/config
    touch /root/.ssh/config
    if [[ "$USESECRETS" == "true" ]]; then
      # Copy the key material best-effort: a failed copy must not abort.
      # NOTE: the closing `set -e` leaves errexit enabled for the rest of
      # this script whenever this branch runs.
      set +e
      yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
      yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
      set -e
    fi
    if [ -n "$SSHPORT" ]; then
      # Use the same port for the SSH client and for the local sshd.
      echo "Port $SSHPORT" > /root/.ssh/config
      sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
    fi
    echo "StrictHostKeyChecking no" >> /root/.ssh/config
    /usr/sbin/sshd -D