# config.yaml - Helm template for the Horovod ConfigMap (hostfile and SSH helper scripts)

  {{- /*
  Template-scoped variables used when rendering the hostfile:
    $workerNum - worker count, read from .Values.worker.number
    $name      - result of the "horovod.fullname" named template
    $slots     - value of the "nvidia.com/gpu" entry in .Values.resources,
                 or 1 when that entry is absent or empty

  $slots is declared exactly once. Re-declaring it with ":=" inside an "if"
  block creates a new variable that goes out of scope at "end", which would
  leave the outer value stuck at 1.
  */ -}}
  {{- $workerNum := .Values.worker.number -}}
  {{- $name := include "horovod.fullname" . }}
  {{- $slots := default 1 (index .Values.resources "nvidia.com/gpu") }}
  apiVersion: v1
  kind: ConfigMap
  metadata:
    name: {{ template "horovod.fullname" . }}
    labels:
      heritage: {{ .Release.Service | quote }}
      release: {{ .Release.Name | quote }}
      chart: {{ template "horovod.chart" . }}
      app: {{ template "horovod.fullname" . }}
  data:
    # Host list: one "<host> slots=<n>" line for the driver, followed by one
    # line per worker index in [0, worker count).
    hostfile.config: |
      {{ $name }}-driver slots={{ $slots }}
      {{- range $i, $none := until (int $workerNum) }}
      {{ $name }}-{{ $i }}.{{ $name }} slots={{ $slots }}
      {{- end }}
    # Succeeds only if an SSH session to localhost can run a command.
    ssh.readiness: |
      #!/bin/bash
      set -xev
      ssh localhost ls
    # Driver script: prepares /root/.ssh, starts sshd in the background, then
    # either runs the command given as arguments or sleeps forever.
    driver.run: |
      #!/bin/bash
      set -x
      sleep 5
      # Always start from an empty SSH client config.
      mkdir -p /root/.ssh
      rm -f /root/.ssh/config
      touch /root/.ssh/config
      if [[ "$USESECRETS" == "true" ]]; then
        # Copy failures are tolerated here.
        # NOTE: the closing "set -e" turns errexit on for the rest of this
        # script, and only when this branch is taken.
        set +e
        yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
        yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
        set -e
      fi
      if [ -n "$SSHPORT" ]; then
        # Use the same port for outgoing connections and for the local sshd.
        echo "Port $SSHPORT" > /root/.ssh/config
        sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
      fi
      echo "StrictHostKeyChecking no" >> /root/.ssh/config
      /usr/sbin/sshd
      if [ $# -eq 0 ]; then
        sleep infinity
      else
        bash -c "$*"
      fi
    # Takes a hostfile path as $1 and blocks until every host listed in it,
    # except those whose name ends in "driver", accepts an SSH connection.
    # Exits non-zero if a host is still unreachable after all retries.
    driver.waitWorkerReady: |
      #!/bin/bash
      set -xev

      # Recreate the SSH client config: optional port override, and host key
      # checking disabled regardless of whether SSHPORT is set (the same
      # behaviour as driver.run and worker.run).
      function updateSSHPort() {
        mkdir -p /root/.ssh
        rm -f /root/.ssh/config
        touch /root/.ssh/config
        if [ -n "$SSHPORT" ]; then
          echo "Port $SSHPORT" > /root/.ssh/config
        fi
        echo "StrictHostKeyChecking no" >> /root/.ssh/config
      }

      # runCheckSSH <hostfile>: probe each non-driver host named in the first
      # column of the hostfile.
      function runCheckSSH() {
        if [[ "$USESECRETS" == "true" ]]; then
          # Copy failures are tolerated; errexit is restored afterwards.
          set +e
          yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
          yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
          set -e
        fi
        local host
        for host in $(awk '{print $1}' "$1"); do
          if [[ "$host" != *"driver" ]]; then
            retry 30 ssh -o ConnectTimeout=2 -q "$host" exit
          fi
        done
      }

      # retry <retry_number> <command> [args...]
      # Runs the command once and, while it keeps failing, up to
      # <retry_number> more times with a one second pause between attempts.
      # Returns as soon as the command succeeds; exits the whole script with
      # status 1 when every attempt has failed.
      function retry() {
        if [[ $# -le 1 ]]; then
          echo "Usage $0 <retry_number> <Command>"
          return 1
        fi
        local try=$1
        shift
        local n=0
        # The command runs as the loop condition, so its failures do not
        # trigger errexit.
        until "$@"; do
          if [[ $n -ge $try ]]; then
            echo "Command still failing after $try retries: [$*]" >&2
            exit 1
          fi
          echo "Command Fail.."
          n=$((n + 1))
          echo "retry $n :: [$*]"
          sleep 1
        done
      }

      updateSSHPort
      runCheckSSH "$1"
    # Worker script: prepares /root/.ssh the same way as the driver, then
    # runs sshd in the foreground (-D).
    worker.run: |
      #!/bin/bash
      set -x
      # Always start from an empty SSH client config.
      mkdir -p /root/.ssh
      rm -f /root/.ssh/config
      touch /root/.ssh/config
      if [[ "$USESECRETS" == "true" ]]; then
        # Copy failures are tolerated here.
        # NOTE: the closing "set -e" turns errexit on for the rest of this
        # script, and only when this branch is taken.
        set +e
        yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
        yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
        set -e
      fi
      if [ -n "$SSHPORT" ]; then
        # Use the same port for outgoing connections and for the local sshd.
        echo "Port $SSHPORT" > /root/.ssh/config
        sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
      fi
      echo "StrictHostKeyChecking no" >> /root/.ssh/config
      /usr/sbin/sshd -D