123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
- import argparse
- import glob
- import json
- import numpy as np
- import os
- import pandas as pd
- import warnings
- warnings.filterwarnings("ignore")
- from pathlib import Path
- from rich import print
# Column names of the per-run results table, in row order.
# The first seven identify an experiment configuration; together with "Run"
# (the eighth) they are used as sort keys, and the first seven as group keys.
COLUMNS = [
    # Experiment configuration
    "Model", "Dataset", "Setup", "Temp.", "Top P", "Cost", "Install",
    # Index of the run within the configuration
    "Run",
    # Counts read from the "report" section of results.json
    "Not Generated", "Generated", "Applied", "Resolved",
    # List of resolved instance ids for the run
    "Resolved IDs",
    # Per-instance cost lists gathered from the trajectory files
    "Costs Success", "Costs Failure", "Costs Overall",
]
def get_folders(path):
    """Return the immediate sub-directories of ``path`` as ``Path`` objects."""
    subdirs = []
    for child in Path(path).iterdir():
        # Anything that is not a directory (e.g. a plain file) is ignored.
        if child.is_dir():
            subdirs.append(child)
    return subdirs
def parse_folder_name(folder_name):
    """
    Split an experiment folder name into its "__"-separated fields.

    A name that yields exactly seven fields gets an empty string appended as
    an eighth field; names with any other number of fields are returned as
    split, without padding.
    """
    fields = folder_name.split("__")
    return fields + [""] if len(fields) == 7 else fields
def _strip_dataset_prefix(dataset):
    """Drop a leading ``swe-bench-dev-easy-`` or ``swe-bench-dev-`` from ``dataset``."""
    # The longer prefix must be tried first: it starts with the shorter one.
    for prefix in ("swe-bench-dev-easy-", "swe-bench-dev-"):
        if dataset.startswith(prefix):
            return dataset[len(prefix):]
    return dataset


def _parse_run_number(run_field):
    """
    Return the integer run index encoded in the last folder-name field.

    Both the ``run-<n>`` and ``run<n>`` spellings are accepted. Text after the
    next "-" is discarded and underscores are removed before conversion.

    Raises:
        ValueError: if the remaining text is not an integer.
    """
    separator = "run-" if "run-" in run_field else "run"
    digits = run_field.split(separator)[-1].split("-")[0].replace("_", "").strip()
    return int(digits)


def _extract_resolved_ids(results_data):
    """
    Return the list of resolved instance ids stored in a results.json payload.

    The list is looked up under ``"resolved"`` first and under
    ``"counts" -> "resolved"`` second; an empty list is returned when neither
    location holds a list.
    """
    resolved = results_data.get("resolved")
    if isinstance(resolved, list):
        return resolved
    counts = results_data.get("counts")
    # ``counts`` may lack a "resolved" key (or not be a mapping at all), so
    # look it up defensively instead of indexing.
    if isinstance(counts, dict) and isinstance(counts.get("resolved"), list):
        return counts["resolved"]
    return []


def _collect_instance_costs(folder, resolved_ids):
    """
    Read ``instance_cost`` from every ``*.traj`` file directly inside ``folder``.

    Returns:
        A ``(costs_success, costs_failure, costs_overall)`` tuple of lists.
        A cost goes to ``costs_success`` when the trajectory's file name
        (without the ``.traj`` suffix) is in ``resolved_ids`` and to
        ``costs_failure`` otherwise; ``costs_overall`` receives every cost.
    """
    costs_success = []
    costs_failure = []
    costs_overall = []
    # Sorted so the order of the cost lists is reproducible between runs.
    for traj_file in sorted(glob.glob(os.path.join(str(folder), "*.traj"))):
        with open(traj_file) as handle:
            traj_data = json.load(handle)
        info = traj_data.get("info", {})
        # Trajectories without model statistics carry no cost information.
        if "model_stats" not in info:
            continue
        run_cost = info["model_stats"]["instance_cost"]
        # Remove only the ".traj" suffix so ids containing dots stay intact,
        # and let pathlib deal with the platform's path separator.
        inst_id = Path(traj_file).stem
        costs_overall.append(run_cost)
        if inst_id in resolved_ids:
            costs_success.append(run_cost)
        else:
            costs_failure.append(run_cost)
    return costs_success, costs_failure, costs_overall


def convert_experiments_to_rows(folder_name, runs_max):
    """
    Convert each experiment run found under ``folder_name`` to a table row.

    Run folders are searched two directory levels below ``folder_name``. A
    run folder is skipped when:

    * its name contains "debug" or "ft_gpt-3.5",
    * it has no ``results.json`` file,
    * its name does not split into eight "__"-separated fields,
    * its last field does not contain "run", or
    * ``runs_max`` is not None and the run index is greater than ``runs_max``.

    The fields are read positionally as model, dataset, setup, temperature,
    top-p, cost limit, install flag and run; the first two characters of the
    temperature, top-p and cost fields are dropped (assumed to be the "t-",
    "p-" and "c-" prefixes) before conversion to ``float``.

    Args:
        folder_name: Root folder holding the experiment directories.
        runs_max: Largest run index to include, or None for no limit.

    Returns:
        A list of 16-element rows in the order of the module-level COLUMNS.

    Raises:
        ValueError: if a numeric field or the run index cannot be parsed.
    """
    rows = []
    for directory in get_folders(folder_name):
        for folder in get_folders(directory):
            # Skip debug folders and fine tuned models
            if "debug" in folder.name or "ft_gpt-3.5" in folder.name:
                continue

            # Skip folders without a results.json file
            json_file = folder / "results.json"
            if not json_file.exists():
                continue

            # Validate the number of fields before indexing into them, so an
            # unexpectedly short folder name is skipped instead of raising
            # IndexError.
            folder_data = parse_folder_name(folder.name)
            if len(folder_data) != 8:
                # TODO: This might be too strict?
                continue

            # Extract run attributes
            model, dataset, setup = folder_data[:3]
            dataset = _strip_dataset_prefix(dataset)
            temperature = float(folder_data[3][len("t-"):].strip())
            top_p = float(folder_data[4][len("p-"):].strip())
            cost = float(folder_data[5][len("c-"):].strip())
            install = "Y" if folder_data[6].strip() == "install-1" else "N"

            # Parse out run number
            run_field = folder_data[-1]
            if "run" not in run_field:
                continue
            try:
                run = _parse_run_number(run_field)
            except ValueError:
                # Show the offending field before propagating the error.
                print(run_field)
                raise

            if runs_max is not None and run > runs_max:
                continue

            # Load results.json file
            with json_file.open() as file:
                results_data = json.load(file)
            report = results_data.get("report", {})

            # Resolved ids are kept per run (to calculate pass@k)
            resolved_ids = _extract_resolved_ids(results_data)

            # Extract instance costs from trajectories
            costs_success, costs_failure, costs_overall = _collect_instance_costs(
                folder, resolved_ids
            )

            rows.append(
                [
                    model,
                    dataset,
                    setup,
                    temperature,
                    top_p,
                    cost,
                    install,
                    run,
                    report.get("# Not Generated", 0),
                    report.get("# Generated", 0),
                    report.get("# Applied", 0),
                    report.get("# Resolved", 0),
                    resolved_ids,
                    costs_success,
                    costs_failure,
                    costs_overall,
                ]
            )
    return rows
def get_results_df(folder_name, runs_max):
    """
    Build the per-run results table as a pandas DataFrame.

    Rows come from ``convert_experiments_to_rows(folder_name, runs_max)``,
    are labelled with COLUMNS and sorted by the first eight of those columns.
    """
    experiment_rows = convert_experiments_to_rows(folder_name, runs_max)
    frame = pd.DataFrame(experiment_rows, columns=COLUMNS)
    sort_keys = COLUMNS[:8]
    return frame.sort_values(by=sort_keys)
def get_results_csv(folder_name, runs_max=None):
    """
    Write the per-run results table for ``folder_name`` to ``results.csv``.

    Args:
        folder_name: Root folder holding the experiment directories.
        runs_max: Largest run index to include, or None (the default) for no
            limit. It is forwarded to ``get_results_df``, which requires it;
            omitting it there made every call of this function fail with a
            TypeError.
    """
    get_results_df(folder_name, runs_max).to_csv("results.csv")
    print("Experiment results written to results.csv")
def _flatten(list_of_lists):
    """Concatenate the lists held in ``list_of_lists`` into one flat list."""
    return [item for sublist in list_of_lists for item in sublist]


def main():
    """
    Aggregate the per-run results per experiment and print them per dataset.

    Runs are grouped by the first seven COLUMNS. For each group the number of
    runs is counted, the report counts are averaged, the distinct resolved
    ids across all runs are counted (reported as "Pass@K") and the
    per-instance costs are averaged. Optional command-line filters are
    applied to the aggregated table before printing.
    """
    parser = argparse.ArgumentParser(description="Aggregate results from experiments")
    parser.add_argument("--folder", type=str, help="Folder containing experiment results", default="../trajectories")
    parser.add_argument("--model", nargs='+', type=str, help="Model(s) to filter results by.")
    parser.add_argument("--dataset", nargs='+', type=str, help="Dataset to filter results by.")
    parser.add_argument("--setup", nargs='+', type=str, help="Setup to filter results by.")
    parser.add_argument("--runs_min", type=int, help="Minimum number of runs that experiment should have been run for.")
    parser.add_argument("--runs_max", type=int, help="Maximum number of runs taken into account")
    args = parser.parse_args()

    df = get_results_df(args.folder, args.runs_max)
    grouped_data = (
        df.groupby(COLUMNS[:7])
        .agg(
            {
                "Run": "count",  # Count the number of runs
                "Not Generated": "mean",
                "Generated": "mean",
                "Applied": "mean",
                "Resolved": "mean",
                # Number of distinct instances resolved by at least one run
                "Resolved IDs": lambda x: len({item for sublist in x for item in sublist}),
                # Mean cost over the instances of all runs in the group
                "Costs Success": lambda x: np.mean(_flatten(x)),
                "Costs Failure": lambda x: np.mean(_flatten(x)),
                "Costs Overall": lambda x: np.mean(_flatten(x)),
            }
        )
        .round(2)
        .reset_index()
        .rename(columns={"Resolved IDs": "Pass@K", "Run": "Runs"})
    )

    # Filtering
    if args.model:
        grouped_data = grouped_data[grouped_data['Model'].isin(args.model)]
    if args.dataset:
        grouped_data = grouped_data[grouped_data['Dataset'].isin(args.dataset)]
    if args.setup:
        grouped_data = grouped_data[grouped_data['Setup'].isin(args.setup)]
    if args.runs_min is not None:
        # The run count column was renamed from "Run" to "Runs" above, so the
        # filter has to use the new name.
        grouped_data = grouped_data[grouped_data['Runs'] >= args.runs_min]

    print(f"Total experiments run: {grouped_data.shape[0]}")
    grouped_data_sorted = grouped_data.sort_values(by=['Dataset', 'Resolved'], ascending=[True, False])
    # Print every row instead of pandas' truncated preview.
    pd.set_option("display.max_rows", None)
    for name, group in grouped_data_sorted.groupby('Dataset'):
        print(f'\n-----------------\nDataset: {name}\n-----------------')
        print(group.to_string(index=False))


if __name__ == "__main__":
    main()
|