#!/bin/bash
helpFunction()
{
   echo ""
   echo "Usage: $0 -m model-parallelism -g gpus-per-node -n node-count -b batch-size -s steps -l layers -h hidden-size -q seq-length -e heads -c ckpt-num-layers [-o other-args] [-d ds-config] [-z] [-p master-port]"
   echo -e "\t-m model parallelism"
   echo -e "\t-g gpus per node"
   echo -e "\t-n node count"
   echo -e "\t-b batch size"
   echo -e "\t-s training steps"
   echo -e "\t-l layers"
   echo -e "\t-h hidden size"
   echo -e "\t-q sequence length"
   echo -e "\t-e attention heads"
   echo -e "\t-c checkpoint num_layers"
   echo -e "\t-o other args"
   echo -e "\t-d DeepSpeed config json file"
   echo -e "\t-z enable ZeRO optimization"
   echo -e "\t-p DeepSpeed master port"
   exit 1
}
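# Example invocation (illustrative values; "ds_pretrain.sh" is a hypothetical
# name for this script, and ds_config.json is assumed to sit next to it):
#   bash ds_pretrain.sh -m 2 -g 8 -n 1 -b 8 -s 100 -l 24 -h 1024 -q 1024 -e 16 -c 1 -d ds_config.json -z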
# Defaults for the optional arguments
layers=2
hidden_size=128
seq_length=1024
ckpt_num_layers=1
other_args=""
ds_opt=""
zero_opt=""
master_port=29600
script_path=$(realpath "$0")
script_dir=$(dirname "$script_path")
# Note: "p:" must appear in the getopts string for -p to receive an argument
while getopts "m:g:n:b:s:l:h:q:e:c:p:o:d:z" opt
do
   case "$opt" in
      m ) mp="$OPTARG" ;;
      g ) gpus="$OPTARG" ;;
      n ) nodes="$OPTARG" ;;
      b ) bs="$OPTARG" ;;
      s ) steps="$OPTARG" ;;
      l ) layers="$OPTARG" ;;
      h ) hidden_size="$OPTARG" ;;
      q ) seq_length="$OPTARG" ;;
      e ) heads="$OPTARG" ;;
      c ) ckpt_num_layers="$OPTARG" ;;
      p ) master_port="$OPTARG" ;;
      o ) other_args="$OPTARG" ;;
      d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;;
      z ) zero_opt="--zero_optimization" ;;
      ? ) helpFunction ;;
   esac
done
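# A minimal sketch of the DeepSpeed config JSON that -d points to; the file
# contents below are illustrative assumptions, not this project's own config:
#   {
#     "train_batch_size": 8,
#     "gradient_accumulation_steps": 1,
#     "fp16": { "enabled": true },
#     "zero_optimization": { "stage": 1 }
#   }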
# Show usage and exit if any required parameter is missing
# (heads has no default, so it is required as well)
if [ -z "$mp" ] || [ -z "$gpus" ] || [ -z "$nodes" ] || [ -z "$bs" ] || [ -z "$steps" ] || [ -z "$heads" ]
then
   echo "Some or all of the required parameters are empty";
   helpFunction
fi
# Change for multi-node configuration
MASTER_ADDR=localhost
MASTER_PORT=6000
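# For true multi-node runs, the deepspeed launcher can also read a hostfile
# (the path below is a placeholder assumption):
#   deepspeed --hostfile=/path/to/hostfile ...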
gpt_options=" \
   --model-parallel-size ${mp} \
   --num-layers ${layers} \
   --hidden-size ${hidden_size} \
   --num-attention-heads ${heads} \
   --batch-size ${bs} \
   --seq-length ${seq_length} \
   --max-position-embeddings ${seq_length} \
   --train-iters ${steps} \
   --train-data webtext \
   --lazy-loader \
   --tokenizer-type GPT2BPETokenizer \
   --split 949,50,1 \
   --distributed-backend nccl \
   --lr 0.00015 \
   --no-load-optim \
   --lr-decay-style cosine \
   --weight-decay 1e-2 \
   --clip-grad 1.0 \
   --warmup .01 \
   --checkpoint-activations \
   --checkpoint-num-layers ${ckpt_num_layers} \
   --fp16 \
   --cache-dir /tmp/cache_dir \
   --log-interval 1 \
   ${other_args} \
   ${ds_opt} \
   ${zero_opt} \
"
work_dir="../../../DeepSpeedExamples/Megatron-LM/"
run_cmd="(cd ${work_dir} && deepspeed --master_port ${master_port} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})"
echo ${run_cmd}
eval ${run_cmd}