#!/bin/bash
#
# install.sh - build and install DeepSpeed and, optionally, its third-party
# dependencies, either on this machine only or on every host listed in an
# MPI-style hostfile. Run with --help for the supported options.
#
# Abort on the first command that fails; the ERR trap registered below this
# preamble reports where the failure happened.
set -e
  #######################################
  # Report a failed command; installed as the handler of the ERR trap.
  # Arguments: $1 - line number on which the failing command was found
  # Outputs:   two diagnostic lines on stderr
  #######################################
  err_report() {
    echo "Error on line $1" >&2
    echo "Fail to install deepspeed" >&2
  }

  # Single quotes delay the expansion of LINENO until the trap fires, so the
  # reported number is the line of the failing command, not of this statement.
  trap 'err_report $LINENO' ERR
  #######################################
  # Print the command-line help text.
  # Arguments: none
  # Outputs:   the help text on stdout, framed by one blank line before and
  #            after
  #######################################
  usage() {
    # One printf argument per output line keeps the help text readable in the
    # source without making the printed text depend on code indentation.
    printf '%s\n' \
      '' \
      'Usage: install.sh [options...]' \
      '' \
      'By default will install deepspeed and all third party dependencies across all machines listed in' \
      'hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally' \
      '' \
      '[optional]' \
      '    -d, --deepspeed_only     Install only deepspeed and no third party dependencies' \
      '    -t, --third_party_only   Install only third party dependencies and not deepspeed' \
      '    -l, --local_only         Install only on local machine' \
      '    -s, --pip_sudo           Run pip with sudo (default: no sudo)' \
      '    -m, --pip_mirror         Use the specified pip mirror (default: the default pip mirror)' \
      '    -H, --hostfile           Path to MPI-style hostfile (default: /job/hostfile)' \
      '    -a, --apex_commit        Install a specific commit hash of apex, instead of the one deepspeed points to' \
      '    -k, --skip_requirements  Skip installing DeepSpeed requirements' \
      '    -h, --help               This help text' \
      ''
  }
  # Default option values; the command-line parser below overrides them.
  ds_only=0              # 1 once -d/--deepspeed_only has been seen
  tp_only=0              # 1 once -t/--third_party_only has been seen
  deepspeed_install=1    # build and install the deepspeed wheel
  third_party_install=1  # build and install the third-party (apex) wheel
  local_only=0           # 1 = install on this machine only
  pip_sudo=0             # 1 = prefix pip commands with "sudo -H"
  # NOTE(review): entire_dlts_job is not read anywhere else in this script as
  # shown; kept in case something outside this file relies on it.
  entire_dlts_job=1
  hostfile=/job/hostfile # MPI-style hostfile listing the target machines
  pip_mirror=""          # empty = use pip's default index
  apex_commit=""         # empty = build apex at the commit the submodule pins
  skip_requirements=0    # 1 = do not install requirements.txt
  #######################################
  # Parse the command line into the option globals and validate the result.
  # Globals:   writes deepspeed_install, third_party_install, ds_only,
  #            tp_only, local_only, pip_sudo, pip_mirror, apex_commit,
  #            skip_requirements, hostfile
  # Arguments: the script's positional parameters ("$@")
  # Outputs:   help text on stdout for -h/--help; diagnostics (followed by
  #            the help text) on stderr for invalid input
  # Exits:     0 after -h/--help; 1 on an unknown argument, an option that is
  #            missing its value, a hostfile that does not exist, or when
  #            both -d and -t are given
  #######################################
  parse_args() {
    while [[ $# -gt 0 ]]; do
      # Options that consume a value must be followed by one. Without this
      # check the second `shift` below fails on an empty argument list and the
      # script dies with nothing but a line number.
      case "$1" in
        -m|--pip_mirror|-a|--apex_commit|-H|--hostfile)
          if [[ $# -lt 2 ]]; then
            echo "Option $1 requires a value" >&2
            usage >&2
            exit 1
          fi
          ;;
      esac

      case "$1" in
        -d|--deepspeed_only)
          deepspeed_install=1
          third_party_install=0
          ds_only=1
          shift
          ;;
        -t|--third_party_only)
          deepspeed_install=0
          third_party_install=1
          tp_only=1
          shift
          ;;
        -l|--local_only)
          local_only=1
          shift
          ;;
        -s|--pip_sudo)
          pip_sudo=1
          shift
          ;;
        -m|--pip_mirror)
          pip_mirror=$2
          shift 2
          ;;
        -a|--apex_commit)
          apex_commit=$2
          shift 2
          ;;
        -k|--skip_requirements)
          skip_requirements=1
          shift
          ;;
        -H|--hostfile)
          hostfile=$2
          # Quoted on purpose: an unquoted empty path collapses `[ ! -f ]`
          # into a one-argument test that reports the file as present.
          if [[ ! -f "$hostfile" ]]; then
            echo "User provided hostfile does not exist at $hostfile, exiting" >&2
            exit 1
          fi
          shift 2
          ;;
        -h|--help)
          usage
          exit 0
          ;;
        *)
          echo "Unknown argument: $1" >&2
          usage >&2
          exit 1
          ;;
      esac
    done

    # -d and -t each disable what the other one enables, so asking for both
    # is a contradiction rather than "install everything".
    if [[ "$ds_only" == "1" && "$tp_only" == "1" ]]; then
      echo "-d and -t are mutually exclusive, only choose one or none" >&2
      usage >&2
      exit 1
    fi
  }

  parse_args "$@"
  # Record the current commit and branch in deepspeed/git_version_info.py and
  # echo the generated file. The file is rewritten from scratch on every run.
  echo "Updating git hash/branch info"
  {
    echo "git_hash = '$(git rev-parse --short HEAD)'"
    echo "git_branch = '$(git rev-parse --abbrev-ref HEAD)'"
  } > deepspeed/git_version_info.py
  cat deepspeed/git_version_info.py
  # Build the two command prefixes used for every pip invocation.
  # They stay plain strings (not arrays) because the remote-install section
  # splices them into the command lines it hands to pdsh.
  if [[ "$pip_sudo" == "1" ]]; then
    PIP_SUDO="sudo -H"
  else
    PIP_SUDO=""
  fi

  if [[ -n "$pip_mirror" ]]; then
    PIP_INSTALL="pip install -i $pip_mirror"
  else
    PIP_INSTALL="pip install"
  fi
  # Without a readable hostfile there is no list of remote machines, so fall
  # back to installing on this machine only.
  # The path is quoted on purpose: unquoted, an empty value collapses the test
  # to `[ ! -f ]`, which is false, and the fallback would be skipped.
  if [[ ! -f "$hostfile" ]]; then
    echo "No hostfile exists at $hostfile, installing locally"
    local_only=1
  fi
  if [[ "$skip_requirements" == "0" ]]; then
    # Ensure dependencies are installed locally
    # PIP_SUDO and PIP_INSTALL each hold several words (or nothing at all), so
    # they are expanded unquoted on purpose to split into command + arguments.
    # shellcheck disable=SC2086
    $PIP_SUDO $PIP_INSTALL -r requirements.txt
  fi
  # Build wheels
  if [[ "$third_party_install" == "1" ]]; then
    echo "Checking out sub-module(s)"
    git submodule update --init --recursive

    echo "Building apex wheel"
    # The subshell confines the directory change, so the rest of the script
    # keeps running from the repository root whatever happens in here.
    (
      cd third_party/apex || exit 1
      if [[ -n "$apex_commit" ]]; then
        echo "Installing a non-standard version of apex at commit: $apex_commit"
        git fetch
        git checkout "$apex_commit"
      fi
      python setup.py --cpp_ext --cuda_ext bdist_wheel
    )

    echo "Installing apex locally so that deepspeed will build"
    # PIP_SUDO / PIP_INSTALL are multi-word strings; unquoted on purpose.
    # shellcheck disable=SC2086
    $PIP_SUDO pip uninstall -y apex
    # shellcheck disable=SC2086
    $PIP_SUDO $PIP_INSTALL third_party/apex/dist/apex*.whl
  fi

  if [[ "$deepspeed_install" == "1" ]]; then
    echo "Building deepspeed wheel"
    python setup.py bdist_wheel
  fi
  # Install the freshly built wheels: on this machine when local_only is set,
  # otherwise on every host named in the hostfile (via pdsh/pdcp over ssh).
  if [[ "$local_only" == "1" ]]; then
    if [[ "$deepspeed_install" == "1" ]]; then
      echo "Installing deepspeed"
      # PIP_SUDO / PIP_INSTALL are multi-word strings; unquoted on purpose.
      # shellcheck disable=SC2086
      $PIP_SUDO pip uninstall -y deepspeed
      # shellcheck disable=SC2086
      $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
      # Test the command directly: under `set -e` a separate `$?` check after
      # a failing command is never reached, so the failure message was dead
      # code.
      if python basic_install_test.py; then
        echo "Installation is successful"
      else
        echo "Installation failed"
        exit 1
      fi
    fi
  else
    if [[ ! -f "$hostfile" ]]; then
      echo "hostfile not found, cannot proceed"
      exit 1
    fi
    # First column of every hostfile line, joined with commas for `pdsh -w`.
    hosts=$(awk '{print $1}' "$hostfile" | paste -sd "," -)
    export PDSH_RCMD_TYPE=ssh
    # NOTE(review): fixed, predictable path under /tmp on every host; consider
    # a per-run directory if the hosts are shared with untrusted users.
    tmp_wheel_path="/tmp/deepspeed_wheels"

    # `rm -f` so that an existing but empty staging directory is not an error.
    pdsh -w "$hosts" "if [ -d $tmp_wheel_path ]; then rm -f $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi"
    pdcp -w "$hosts" requirements.txt "${tmp_wheel_path}/"

    # pdsh reports success regardless of what the remote commands return
    # unless -S is given; -S makes it return the largest remote exit status,
    # so a failed remote install or import check stops the script.
    if [[ "$skip_requirements" == "0" ]]; then
      pdsh -S -w "$hosts" "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
    fi

    if [[ "$third_party_install" == "1" ]]; then
      pdsh -w "$hosts" "$PIP_SUDO pip uninstall -y apex"
      pdcp -w "$hosts" third_party/apex/dist/apex*.whl "$tmp_wheel_path/"
      pdsh -S -w "$hosts" "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/apex*.whl"
      pdsh -S -w "$hosts" 'python -c "import apex"'
    fi

    if [[ "$deepspeed_install" == "1" ]]; then
      echo "Installing deepspeed"
      pdsh -w "$hosts" "$PIP_SUDO pip uninstall -y deepspeed"
      pdcp -w "$hosts" dist/deepspeed*.whl "$tmp_wheel_path/"
      pdcp -w "$hosts" basic_install_test.py "$tmp_wheel_path/"
      pdsh -S -w "$hosts" "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
      # Only claim success when the smoke test passed on every host.
      if pdsh -S -w "$hosts" "python $tmp_wheel_path/basic_install_test.py"; then
        echo "Installation is successful"
      else
        echo "Installation failed"
        exit 1
      fi
    fi

    # Remove everything this script staged on the hosts, then the directory.
    pdsh -w "$hosts" "if [ -d $tmp_wheel_path ]; then rm -f $tmp_wheel_path/*.whl $tmp_wheel_path/basic_install_test.py $tmp_wheel_path/requirements.txt; rmdir $tmp_wheel_path; fi"
  fi