# setup_vms.sh (2.0 KB)
  1. #!/bin/bash
  2. azure_config=azure_config.json
  3. if [ ! -f ${azure_config} ]; then
  4. echo "Cannot find $azure_config"
  5. exit 1
  6. fi
  7. location=`cat ${azure_config} | jq .location | sed 's/"//g'`
  8. rg=deepspeed_rg_$location
  9. ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'`
  10. if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
  11. docker_ssh_port=`cat ${azure_config} | jq .docker_ssh_port`
  12. if [ $docker_ssh_port == "null" ]; then echo 'missing docker_ssh_port in config'; exit 1; fi
  13. username=deepspeed
  14. args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
  15. num_vms=`az vm list -g $rg | jq '. | length'`
  16. first_ip_addr=`az vm list-ip-addresses -g $rg | jq .[0].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'`
  17. num_slots=`ssh $args ${username}@${first_ip_addr} 'nvidia-smi -L | wc -l'`
  18. echo "number of slots per vm: $num_slots"
  19. hostfile=hostfile
  20. ssh_config=config
  21. echo -n "" > $hostfile
  22. echo -n "" > $ssh_config
  23. for node_id in `seq 0 $((num_vms - 1))`; do
  24. private_ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.privateIpAddresses[0] | sed 's/"//g'`
  25. echo "worker-${node_id} slots=${num_slots}" >> hostfile
  26. echo "Host worker-${node_id}
  27. HostName ${private_ip_addr}
  28. Port ${docker_ssh_port}
  29. StrictHostKeyChecking no
  30. " >> ${ssh_config}
  31. done
  32. update_script="
  33. sudo mkdir -p /job;
  34. sudo chmod -R 777 /job;
  35. mkdir -p workdir;
  36. git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed;
  37. "
  38. for node_id in `seq 0 $((num_vms - 1))`; do
  39. ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'`
  40. addr=${username}@${ip_addr}
  41. echo "copying ssh keys, ssh config, hostfile to worker-${node_id}"
  42. ssh $args ${addr} $update_script
  43. scp $args ${ssh_key}* ${addr}:.ssh/
  44. scp $args ${ssh_config} ${addr}:.ssh/
  45. scp $args ${hostfile} ${addr}:/job/
  46. done
  47. rm $hostfile $ssh_config