run_and_eval.sh

#!/bin/bash
# This script runs the agent N times and evaluates each run.
# Usage:
#   bash run_and_eval.sh '' default_with_inclusive_edit_demo_v2 data/dev-easy/swe-bench-dev-easy-med.json 3
# Arguments: suffix, template, dataset path, number of runs

# User-supplied variables
suffix=${1:-''}
template=$2
dataset_path=$3
num_runs=$4
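
# Optional guard (not in the original script): a minimal sketch that fails
# fast when required arguments are missing, instead of letting the python
# commands below error out with a less obvious message.
if [[ -z "$template" || -z "$dataset_path" || -z "$num_runs" ]]; then
    echo "usage: bash run_and_eval.sh <suffix> <template> <dataset_path> <num_runs>" >&2
    exit 1
fi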
# Extract the dataset filename from its path (it appears in the trajectory directory name below)
dataset_name=$(basename "$dataset_path")

for ((i=1; i<=num_runs; i++)); do
    # Step 1: run the agent on the dataset with the given config template
    python run.py \
        --model_name gpt4 \
        --data_path "$dataset_path" \
        --config_file "config/configs/${template}.yaml" \
        --suffix "${suffix}run${i}" \
        --temperature 0.2 \
        --top_p 0.95 \
        --per_instance_cost_limit 3.00 \
        --install_environment 1
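
    # Note: run.py writes predictions under trajectories/$USER/ in a directory
    # whose name encodes the run parameters (model, dataset, template,
    # temperature, top_p, cost limit, install flag, suffix), so the
    # --predictions_path below must mirror the parameters used above exactly.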
    # Step 2: evaluate the predictions produced by this run
    python evaluation/evaluation.py \
        --predictions_path "trajectories/$USER/gpt4__${dataset_name}__${template}__t-0.20__p-0.95__c-3.00__install-1__${suffix}run${i}/all_preds.jsonl" \
        --swe_bench_tasks "$dataset_path" \
        --log_dir ./results \
        --testbed ./testbed \
        --skip_existing \
        --timeout 900 \
        --verbose
done