#!/bin/sh
#
# azure-init.sh - install a Ray wheel into a conda environment, generate the
# Ray / TensorBoard launcher scripts, and register them as systemd services.
#
# Usage: azure-init.sh USERNAME CONDA_ENV WHEEL RAY_HEAD_IP TYPE
#
#   USERNAME     account the install and the services run as; the launcher
#                scripts are written to /home/USERNAME
#   CONDA_ENV    conda environment activated before installing and running
#   WHEEL        argument handed to `pip install`
#   RAY_HEAD_IP  address the worker connects to (port 6379)
#   TYPE         "head" or "worker"; selects ray-head.sh or ray-worker.sh
#
# NOTE(review): presumably run as root - it writes unit files under
# /lib/systemd/system and calls systemctl; confirm against the caller.

USERNAME=$1
CONDA_ENV=$2
WHEEL=$3
RAY_HEAD_IP=$4
TYPE=$5

# Print a message on stderr and abort with a usage-error status.
die() {
  printf '%s\n' "$*" >&2
  exit 2
}

# Fail fast on arguments that would otherwise produce broken paths or a
# service unit pointing at a script that is never generated.
[ -n "$USERNAME" ] ||
  die "usage: $0 USERNAME CONDA_ENV WHEEL RAY_HEAD_IP TYPE"
case "$TYPE" in
  head) ;;
  worker)
    [ -n "$RAY_HEAD_IP" ] || die "RAY_HEAD_IP is required when TYPE is 'worker'"
    ;;
  *) die "TYPE must be 'head' or 'worker' (got '$TYPE')" ;;
esac

home_dir="/home/$USERNAME"

echo "Installing wheel..."
# Run as the target user in a login shell so that user's conda is on PATH.
sudo -u "$USERNAME" -i /bin/bash -l -c "conda init bash"
sudo -u "$USERNAME" -i /bin/bash -l -c "conda activate $CONDA_ENV; pip install $WHEEL"

echo "Setting up service scripts..."
# The here-documents below are unquoted on purpose: $CONDA_ENV, $RAY_HEAD_IP
# and $USERNAME are substituted now, while everything written as \$ is left
# for the generated script to evaluate when the service actually starts.
cat > "$home_dir/ray-head.sh" << EOM
#!/bin/bash
eval "\$(conda shell.bash hook)"
conda activate $CONDA_ENV
NUM_GPUS=\$(nvidia-smi -L | wc -l)
ray stop
ulimit -n 65536
ray start --head --port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --dashboard-host 0.0.0.0
EOM

# The worker wraps `ray start --block` in a loop so it reconnects whenever
# the Ray process exits.
cat > "$home_dir/ray-worker.sh" << EOM
#!/bin/bash
eval "\$(conda shell.bash hook)"
conda activate $CONDA_ENV
NUM_GPUS=\$(nvidia-smi -L | wc -l)
ray stop
ulimit -n 65536
while true
do
ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block
echo Ray exited. Auto-restarting in 1 second...
sleep 1
done
EOM

cat > "$home_dir/tensorboard.sh" << EOM
#!/bin/bash
eval "\$(conda shell.bash hook)"
conda activate $CONDA_ENV
mkdir -p /home/$USERNAME/ray_results
tensorboard --bind_all --logdir=/home/$USERNAME/ray_results
EOM

chmod +x "$home_dir/ray-head.sh" "$home_dir/ray-worker.sh" "$home_dir/tensorboard.sh"

# systemd units: both run as $USERNAME through a login shell; the Ray unit
# picks the head or worker launcher according to $TYPE.
cat > /lib/systemd/system/ray.service << EOM
[Unit]
Description=Ray
[Service]
Type=simple
User=$USERNAME
ExecStart=/bin/bash -l /home/$USERNAME/ray-$TYPE.sh
[Install]
WantedBy=multi-user.target
EOM

cat > /lib/systemd/system/tensorboard.service << EOM
[Unit]
Description=TensorBoard
[Service]
Type=simple
User=$USERNAME
ExecStart=/bin/bash -l /home/$USERNAME/tensorboard.sh
[Install]
WantedBy=multi-user.target
EOM

echo "Configure ray to start at boot..."
systemctl enable ray
echo "Starting ray..."
systemctl start ray

# TensorBoard only runs on the head node. The variable is TYPE (upper case);
# testing the lower-case name would always be false because it is never set.
if [ "$TYPE" = "head" ]; then
  echo "Configure TensorBoard to start at boot..."
  systemctl enable tensorboard
  echo "Starting TensorBoard..."
  systemctl start tensorboard
fi