ds_gpt2_test.sh

#!/bin/bash

helpFunction()
{
    echo ""
    echo "Usage: $0 -m model-parallelism -g gpus-per-node -n node-count -b batch-size -s steps -l layers -h hidden-size -q seq-length -e heads -c ckpt-num-layers -p master-port [-o other-args] [-d ds-config] [-z]"
    echo -e "\t-m model parallelism"
    echo -e "\t-g gpus per node"
    echo -e "\t-n node count"
    echo -e "\t-b batch size"
    echo -e "\t-s training steps"
    echo -e "\t-l layers"
    echo -e "\t-h hidden size"
    echo -e "\t-q sequence length"
    echo -e "\t-e attention heads"
    echo -e "\t-c checkpoint num_layers"
    echo -e "\t-o other args"
    echo -e "\t-d DeepSpeed config json file"
    echo -e "\t-z enable ZeRO optimization"
    echo -e "\t-p DeepSpeed master port"
    exit 1
}

# Defaults
layers=2
hidden_size=128
seq_length=1024
ckpt_num_layers=1
other_args=""
ds_opt=""
zero_opt=""
master_port=29600

script_path=$(realpath "$0")
script_dir=$(dirname "$script_path")

# Parse command-line options (note "p:" so that -p takes the master-port argument)
while getopts "m:g:n:b:s:l:h:q:e:c:p:o:d:z" opt
do
    case "$opt" in
        m ) mp="$OPTARG" ;;
        g ) gpus="$OPTARG" ;;
        n ) nodes="$OPTARG" ;;
        b ) bs="$OPTARG" ;;
        s ) steps="$OPTARG" ;;
        l ) layers="$OPTARG" ;;
        h ) hidden_size="$OPTARG" ;;
        q ) seq_length="$OPTARG" ;;
        e ) heads="$OPTARG" ;;
        c ) ckpt_num_layers="$OPTARG" ;;
        p ) master_port="$OPTARG" ;;
        o ) other_args="$OPTARG" ;;
        d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;;
        z ) zero_opt="--zero_optimization" ;;
        ? ) helpFunction ;;
    esac
done

# Print helpFunction if any required parameter is missing
if [ -z "$mp" ] || [ -z "$gpus" ] || [ -z "$nodes" ] || [ -z "$bs" ] || [ -z "$steps" ]
then
    echo "Some or all of the required parameters are empty"
    helpFunction
fi

# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000

# Megatron-LM GPT-2 training options
gpt_options=" \
    --model-parallel-size ${mp} \
    --num-layers ${layers} \
    --hidden-size ${hidden_size} \
    --num-attention-heads ${heads} \
    --batch-size ${bs} \
    --seq-length ${seq_length} \
    --max-position-embeddings ${seq_length} \
    --train-iters ${steps} \
    --train-data webtext \
    --lazy-loader \
    --tokenizer-type GPT2BPETokenizer \
    --split 949,50,1 \
    --distributed-backend nccl \
    --lr 0.00015 \
    --no-load-optim \
    --lr-decay-style cosine \
    --weight-decay 1e-2 \
    --clip-grad 1.0 \
    --warmup .01 \
    --checkpoint-activations \
    --checkpoint-num-layers ${ckpt_num_layers} \
    --fp16 \
    --cache-dir /tmp/cache_dir \
    --log-interval 1 \
    ${other_args} \
    ${ds_opt} \
    ${zero_opt} \
"

# Launch pretrain_gpt2.py through the deepspeed launcher from the Megatron-LM example directory
work_dir="../../../DeepSpeedExamples/Megatron-LM/"
run_cmd="(cd ${work_dir} && deepspeed --master_port ${master_port} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})"
echo "${run_cmd}"
eval "${run_cmd}"

set +x
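
# Example invocation (illustrative values only; ds_config.json is a hypothetical
# DeepSpeed config file expected to sit next to this script when -d is passed):
#
#   bash ds_gpt2_test.sh -m 2 -g 4 -n 1 -b 8 -s 100 \
#       -l 24 -h 1024 -q 1024 -e 16 -c 1 \
#       -p 29600 -d ds_config.json -z
#
# A minimal sketch of such a ds_config.json (exact fields depend on the
# DeepSpeed version in use; this is not the canonical config for this example):
#
#   {
#     "train_batch_size": 8,
#     "gradient_accumulation_steps": 1,
#     "fp16": { "enabled": true }
#   }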