123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168 |
- #!/bin/bash
#######################################
# Print the command-line help text.
# Globals:   none
# Arguments: none ($0 is used as the program name)
# Outputs:   help text on stdout
#######################################
usage() {
  # Option names listed here must match the case arms of the argument parser:
  # the port option is spelled --master_port, and -d/--deepspeed selects the
  # DeepSpeed variant of the training script.
  cat <<EOF

Usage: $0 [defined arguments...] [other arguments...]
    [defined]
        -g, --num_gpus      num gpus per node to use
        -h, --help          this help text
        -n, --num_nodes     num nodes to use
        -e, --epochs        num of training epochs
        -b, --batch_size    training batch size
        -p, --master_port   master port for nccl
        -d, --deepspeed     run the deepspeed script instead of the baseline
    [other arguments]
        all undefined arguments will be passed to the user's application

EOF
}
#######################################
# Ensure a directory exists; terminate the script otherwise.
# Globals:   none
# Arguments: $1 - directory path to check
#            $2 - human-readable name of the setting holding that path
# Outputs:   "Using <name>: <path>" on stdout when the directory exists,
#            "<path> folder not found" on stderr when it does not
# Exits:     with status 1 when the directory is missing (this ends the
#            calling shell unless the function runs in a subshell)
#######################################
validate_folder() {
  # 'local' keeps these from overwriting same-named variables of the caller.
  local dir=$1
  local dir_name=$2

  if [[ -d "${dir}" ]]; then
    echo "Using ${dir_name}: ${dir}"
  else
    echo "${dir} folder not found" >&2
    exit 1
  fi
}
#######################################
# Delete a directory tree if it exists; do nothing otherwise.
# Despite the wording of the message it prints, this function only deletes
# the directory; it does not recreate it.
# Globals:   none
# Arguments: $1 - directory path to remove
#            $2 - human-readable name of the setting holding that path
# Outputs:   a notice plus rm's verbose listing on stdout when removing
# Returns:   rm's exit status when the directory existed, 0 otherwise
#######################################
remove_folder() {
  local dir=$1
  local dir_name=$2

  if [[ -d "${dir}" ]]; then
    echo "The variable ${dir_name} is set to ${dir} which already exists, so removing and creating a fresh one"
    # Quoted so a path containing spaces or glob characters is removed as a
    # single operand; '--' stops option parsing and ':?' aborts on an empty
    # value instead of handing rm an empty operand.
    rm -rvf -- "${dir:?}"
  fi
}
# Default settings; each can be overridden by the command-line options
# handled by the argument parser that follows.
num_nodes=1
num_gpus=8
epochs=2
batch_size=24
enable_deepspeed=false
# Pick a port in the range 20000-24999 (RANDOM % 5000 added to 20000);
# presumably randomized to reduce clashes between concurrent runs.
master_port=$((20000+RANDOM%5000))
# Learning rate; overridable only by editing this line (no CLI option).
LR=3e-5
# Arguments that are not recognised are accumulated here and forwarded.
# Start from an explicit empty string so a variable of the same name
# inherited from the environment cannot leak into the launch command.
other_args=""
#######################################
# Abort when a value-taking option is the last word on the command line.
# Arguments: $1 - the option as typed
#            $2 - number of command-line words still unparsed (option included)
# Outputs:   error message on stderr
# Exits:     with status 1 when no value follows the option
#######################################
_require_value() {
  if [[ $2 -lt 2 ]]; then
    echo "Error: option $1 requires a value" >&2
    exit 1
  fi
}

#######################################
# Parse command-line options into the script's global settings.
# Globals:   num_gpus, num_nodes, epochs, batch_size, master_port,
#            enable_deepspeed, other_args (written)
# Arguments: the script's command-line arguments
# Outputs:   help text (via usage) for -h/--help
# Exits:     0 after printing help; 1 when an option lacks its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -g|--num_gpus)
        _require_value "$1" "$#"
        num_gpus="$2"
        shift 2
        ;;
      -n|--num_nodes)
        _require_value "$1" "$#"
        num_nodes="$2"
        shift 2
        ;;
      -e|--epochs)
        _require_value "$1" "$#"
        epochs="$2"
        shift 2
        ;;
      -b|--batch_size)
        _require_value "$1" "$#"
        batch_size="$2"
        shift 2
        ;;
      -p|--master_port)
        _require_value "$1" "$#"
        master_port="$2"
        shift 2
        ;;
      -d|--deepspeed)
        enable_deepspeed=true
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *) # other arguments
        # Anything unrecognised is appended (space separated) and later
        # forwarded to the training script.
        other_args="${other_args} $1"
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Resolve the three directories the run depends on. Each one is taken from
# the environment when it is set to a non-empty value; otherwise a default is
# exported and announced. An empty value is treated like an unset one so it
# cannot reach the folder checks or the training command line as an empty
# path. Paths are passed quoted so directories containing spaces survive.

# Validate path to BingBertSquad script
if [[ -z "${BingBertSquad_DIR:-}" ]]; then
  export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad
  echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}"
fi
validate_folder "${BingBertSquad_DIR}" "BingBertSquad_DIR"

# Validate path to processed Squad data
if [[ -z "${SQUAD_DIR:-}" ]]; then
  export SQUAD_DIR=/data/BingBertSquad
  echo "SQUAD_DIR environment variable not set; trying default: ${SQUAD_DIR}"
fi
validate_folder "${SQUAD_DIR}" "SQUAD_DIR"

# Set output path
if [[ -z "${OUTPUT_DIR:-}" ]]; then
  export OUTPUT_DIR=/tmp/BingBertSquad-Output
  echo "OUTPUT_DIR environment variable not set; trying default: ${OUTPUT_DIR}"
fi
# Any existing output directory is deleted before the run starts.
remove_folder "${OUTPUT_DIR}" "OUTPUT_DIR"
# Report the settings in effect for this run, one "name: value" per line.
printf '%s\n' \
  "num_nodes: ${num_nodes}" \
  "num_gpus: ${num_gpus}" \
  "epochs: ${epochs}" \
  "batch_size: ${batch_size}" \
  "master_port: ${master_port}" \
  "deepspeed: ${enable_deepspeed}" \
  "other_args: ${other_args}"
# Derive the per-GPU batch size and the gradient accumulation steps from the
# requested total batch size and the number of GPUs.
EFFECTIVE_BATCH_SIZE=${batch_size}
# Threshold used below to decide the accumulation steps; presumably the
# largest micro-batch that fits on one GPU (confirm against the hardware).
MAX_GPU_BATCH_SIZE=3

# Both values may come straight from the command line. Reject anything that
# is not a positive decimal integer: a zero or non-numeric num_gpus would make
# the division below fail and leave PER_GPU_BATCH_SIZE empty.
if ! [[ "${EFFECTIVE_BATCH_SIZE}" =~ ^[1-9][0-9]*$ && "${num_gpus}" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: batch_size and num_gpus must be positive integers (got batch_size='${batch_size}', num_gpus='${num_gpus}')" >&2
  exit 1
fi

PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE / num_gpus))
# Integer division yields 0 when there are more GPUs than samples per batch,
# which would be passed on as --train_batch_size 0.
if ((PER_GPU_BATCH_SIZE < 1)); then
  echo "Error: batch_size (${batch_size}) must be at least num_gpus (${num_gpus})" >&2
  exit 1
fi

if ((PER_GPU_BATCH_SIZE < MAX_GPU_BATCH_SIZE)); then
  GRAD_ACCUM_STEPS=1
else
  GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE / MAX_GPU_BATCH_SIZE))
fi
# Name the job after the DeepSpeed switch and the GPU count.
printf -v JOB_NAME 'BingBertSquad_ds-%s_%s-gpu' "${enable_deepspeed}" "${num_gpus}"

# Choose the training entry point: the DeepSpeed variant only when the switch
# is exactly "true", the baseline script for every other value.
case "${enable_deepspeed}" in
  true)
    BingBertSquad_script="${BingBertSquad_DIR}/nvidia_run_squad_deepspeed.py"
    ;;
  *)
    BingBertSquad_script="${BingBertSquad_DIR}/nvidia_run_squad_baseline.py"
    ;;
esac
# Assemble the argument string handed to the training script, one option per
# statement. The --do_predict flag is deliberately not included (it was
# commented out); append it here to pass it as well.
squad_args="--bert_model bert-large-uncased"
squad_args+=" --do_train"
squad_args+=" --do_lower_case"
squad_args+=" --train_file ${SQUAD_DIR}/train-v1.1.json"
squad_args+=" --predict_file ${SQUAD_DIR}/dev-v1.1.json"
squad_args+=" --train_batch_size ${PER_GPU_BATCH_SIZE}"
squad_args+=" --learning_rate ${LR}"
squad_args+=" --num_train_epochs ${epochs}"
squad_args+=" --max_seq_length 384"
squad_args+=" --doc_stride 128"
squad_args+=" --output_dir ${OUTPUT_DIR}"
squad_args+=" --gradient_accumulation_steps ${GRAD_ACCUM_STEPS}"
squad_args+=" --job_name ${JOB_NAME}"
squad_args+=" --model_file ${SQUAD_DIR}/training_state_checkpoint_162.tar"
# The string ends with a newline, as the multi-line literal form does.
squad_args+=$'\n'
# Build the launcher command line: deepspeed.pt options first, then the
# training script followed by the forwarded and the generated arguments.
run_cmd="deepspeed.pt \
--num_nodes ${num_nodes} \
--num_gpus ${num_gpus} \
--master_port ${master_port}
${BingBertSquad_script} ${other_args} ${squad_args}"

# run_cmd contains a literal newline after the --master_port value (that line
# has no trailing backslash). The expansions below are intentionally left
# unquoted so word splitting turns that newline into a space; quoting them
# would make eval run the training script path as a separate command.
# NOTE(review): eval re-parses the whole string, so shell metacharacters in
# forwarded arguments are interpreted by the shell. Building the command as an
# array would avoid that, but requires other_args/squad_args to become arrays.
# shellcheck disable=SC2086
echo ${run_cmd}
# shellcheck disable=SC2086
eval ${run_cmd}
# Remember the launcher's status: the commands that follow would otherwise
# replace it and the script would always report success.
run_status=$?

# Turns command tracing off; no matching 'set -x' is visible in this script.
set +x
#python ${BingBertSquad_DIR}/evaluate-v1.1.py ${SQUAD_DIR}/dev-v1.1.json ${OUTPUT_DIR}/predictions.json > ${OUTPUT_DIR}/CorrectnessScores.txt

# Propagate the training run's exit status to the caller.
exit "${run_status}"
|