run_BingBertSquad.sh

#!/bin/bash

usage() {
  echo """
Usage: $0 [defined arguments...] [other arguments...]

[defined]
  -g, --num_gpus     num gpus per node to use
  -h, --help         this help text
  -n, --num_nodes    num nodes to use
  -e, --epochs       num of training epochs
  -b, --batch_size   training batch size
  -p, --master_port  master port for nccl
  -d, --deepspeed    run the DeepSpeed version of the training script

[other arguments]
  all undefined arguments will be passed to the user's application
"""
}
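
# Example (paths are illustrative; they match the defaults below): train on
# 4 GPUs with DeepSpeed enabled, forwarding any unrecognized flags to the
# training script:
#   BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad SQUAD_DIR=/data/BingBertSquad \
#     bash run_BingBertSquad.sh --num_gpus 4 --epochs 2 --batch_size 24 --deepspeed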
validate_folder() {
  dir=$1
  dir_name=$2
  if [[ -d ${dir} ]]; then
    echo "Using ${dir_name}: ${dir}"
  else
    echo "${dir} folder not found"
    exit 1
  fi
}

remove_folder() {
  dir=$1
  dir_name=$2
  if [[ -d ${dir} ]]; then
    echo "The variable ${dir_name} is set to ${dir} which already exists, so removing and creating a fresh one"
    rm -rvf "${dir}"
  fi
}
num_nodes=1
num_gpus=8
epochs=2
batch_size=24
enable_deepspeed=false
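# master_port defaults to a random port in the 20000-24999 range so that
# concurrent runs on the same node are unlikely to collide; override with -p.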
master_port=$((20000+RANDOM%5000))
LR=3e-5
while [[ $# -gt 0 ]]
do
  key="$1"
  case $key in
    -g|--num_gpus)
      num_gpus="$2"
      shift
      shift
      ;;
    -n|--num_nodes)
      num_nodes="$2"
      shift
      shift
      ;;
    -e|--epochs)
      epochs="$2"
      shift
      shift
      ;;
    -b|--batch_size)
      batch_size="$2"
      shift
      shift
      ;;
    -p|--master_port)
      master_port="$2"
      shift
      shift
      ;;
    -d|--deepspeed)
      enable_deepspeed=true
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *) # other arguments
      other_args="${other_args} $1"
      shift
      ;;
  esac
done
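
# The "-z ${VAR+x}" checks below fall back to a default only when the variable
# is truly unset; a variable exported with an empty value is left untouched.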
# Validate path to BingBertSquad script
if [ -z "${BingBertSquad_DIR+x}" ]; then
  export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad
  echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}"
fi
validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR"

# Validate path to processed Squad data
if [ -z "${SQUAD_DIR+x}" ]; then
  export SQUAD_DIR=/data/BingBertSquad
  echo "SQUAD_DIR environment variable not set; trying default: ${SQUAD_DIR}"
fi
validate_folder ${SQUAD_DIR} "SQUAD_DIR"

# Set output path
if [ -z "${OUTPUT_DIR+x}" ]; then
  export OUTPUT_DIR=/tmp/BingBertSquad-Output
  echo "OUTPUT_DIR environment variable not set; trying default: ${OUTPUT_DIR}"
fi
remove_folder ${OUTPUT_DIR} "OUTPUT_DIR"
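# Note: remove_folder deletes any existing OUTPUT_DIR at the start of every run,
# so point OUTPUT_DIR somewhere else if you need to keep earlier results.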
echo "num_nodes: ${num_nodes}"
echo "num_gpus: ${num_gpus}"
echo "epochs: ${epochs}"
echo "batch_size: ${batch_size}"
echo "master_port: ${master_port}"
echo "deepspeed: ${enable_deepspeed}"
echo "other_args: ${other_args}"
EFFECTIVE_BATCH_SIZE=${batch_size}
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/num_gpus))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
  GRAD_ACCUM_STEPS=1
else
  GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
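# With the defaults (batch_size=24, num_gpus=8): PER_GPU_BATCH_SIZE = 24/8 = 3,
# which is not below MAX_GPU_BATCH_SIZE=3, so GRAD_ACCUM_STEPS = 3/3 = 1.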
if [[ ${enable_deepspeed} == true ]]; then
  BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_deepspeed.py
else
  BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_baseline.py
fi
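# Note (assumption): when the DeepSpeed variant is selected, a DeepSpeed JSON
# config is typically also forwarded through the undefined-argument passthrough
# (e.g. "--deepspeed --deepspeed_config <file>.json"), if the training script
# defines those flags.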
JOB_NAME="BingBertSquad_ds-${enable_deepspeed}_${num_gpus}-gpu"

# --do_predict \
squad_args="--bert_model bert-large-uncased \
  --do_train \
  --do_lower_case \
  --train_file ${SQUAD_DIR}/train-v1.1.json \
  --predict_file ${SQUAD_DIR}/dev-v1.1.json \
  --train_batch_size ${PER_GPU_BATCH_SIZE} \
  --learning_rate ${LR} \
  --num_train_epochs ${epochs} \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir ${OUTPUT_DIR} \
  --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
  --job_name ${JOB_NAME} \
  --model_file ${SQUAD_DIR}/training_state_checkpoint_162.tar
"
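# --model_file above is expected to point at a pre-trained BERT checkpoint staged
# under ${SQUAD_DIR}; re-enable the commented-out --do_predict flag to also run
# prediction on the dev set after training.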
run_cmd="deepspeed.pt \
  --num_nodes ${num_nodes} \
  --num_gpus ${num_gpus} \
  --master_port ${master_port} \
  ${BingBertSquad_script} ${other_args} ${squad_args}"
echo ${run_cmd}
eval ${run_cmd}

set +x
#python ${BingBertSquad_DIR}/evaluate-v1.1.py ${SQUAD_DIR}/dev-v1.1.json ${OUTPUT_DIR}/predictions.json > ${OUTPUT_DIR}/CorrectnessScores.txt
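# The commented-out line above scores ${OUTPUT_DIR}/predictions.json with the
# official SQuAD v1.1 evaluation script; it is only meaningful once --do_predict
# has produced a predictions file.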