# aggregate_results.py

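"""Aggregate experiment results from trajectory folders into summary tables.

Walks a folder of experiment runs, reads each run's results.json and
*.traj files, and prints per-dataset tables with generation/resolution
counts, Pass@K, and average instance costs.
"""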
import argparse
import glob
import json
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from rich import print

warnings.filterwarnings("ignore")
COLUMNS = [
    "Model",
    "Dataset",
    "Setup",
    "Temp.",
    "Top P",
    "Cost",
    "Install",
    "Run",
    "Not Generated",
    "Generated",
    "Applied",
    "Resolved",
    "Resolved IDs",
    "Costs Success",
    "Costs Failure",
    "Costs Overall",
]


def get_folders(path):
    """Return all immediate subdirectories of `path`."""
    return [entry for entry in Path(path).iterdir() if entry.is_dir()]


def parse_folder_name(folder_name):
    """
    Split a "__"-delimited experiment folder name into its parts.

    Folders with only seven parts (no trailing run suffix) are padded
    with an empty string so every result has eight fields.
    """
    parsed_folder = folder_name.split("__")
    if len(parsed_folder) == 7:
        parsed_folder.append("")
    return parsed_folder
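
# Expected folder name shape, inferred from the parsing below (the exact
# dataset/setup tokens are illustrative, not confirmed by this script):
#   gpt-4__swe-bench-dev-easy__default__t-0.00__p-0.95__c-3.00__install-1__run-1
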
def convert_experiments_to_rows(folder_name, runs_max):
    """
    Convert each experiment folder into a row of the results table.
    """
    rows = []
    directories = get_folders(folder_name)
    for directory in directories:
        folders = get_folders(directory)
        for folder in folders:
            # Skip debug folders
            if "debug" in folder.name:
                continue
            # Skip fine-tuned models
            if "ft_gpt-3.5" in folder.name:
                continue
            # Skip folders without a results.json file
            json_file = folder / "results.json"
            if not json_file.exists():
                # print(f"No json file in {folder}")
                continue
            # Extract run attributes from the folder name
            folder_data = parse_folder_name(folder.name)
            model = folder_data[0]
            dataset = folder_data[1]
            if dataset.startswith("swe-bench-dev-easy-"):
                dataset = dataset[len("swe-bench-dev-easy-"):]
            elif dataset.startswith("swe-bench-dev-"):
                dataset = dataset[len("swe-bench-dev-"):]
            setup = folder_data[2]
            if len(folder_data) != 8:
                # TODO: This might be too strict?
                continue
            temperature = float(folder_data[3][len("t-"):].strip())
            top_p = float(folder_data[4][len("p-"):].strip())
            cost = float(folder_data[5][len("c-"):].strip())
            install = "Y" if folder_data[6].strip() == "install-1" else "N"
            # Parse out the run number ("run-3" or "run_3"-style suffixes)
            run = folder_data[-1]
            if "run" not in run:
                continue
            try:
                if "run-" in run:
                    run = int(run.split("run-")[-1].split("-")[0].replace("_", "").strip())
                else:
                    run = int(run.split("run")[-1].split("-")[0].replace("_", "").strip())
            except Exception:
                print(f"Could not parse run number from {run!r}")
                raise
            if runs_max is not None and run > runs_max:
                continue
            # Load results.json file
            with json_file.open() as file:
                results_data = json.load(file)
            report = results_data.get("report", {})
            # Extract resolved ids (to calculate pass@k)
            resolved_ids = []
            if "resolved" in results_data and isinstance(results_data["resolved"], list):
                resolved_ids = results_data["resolved"]
            elif "counts" in results_data and isinstance(results_data["counts"]["resolved"], list):
                resolved_ids = results_data["counts"]["resolved"]
            # Extract per-instance costs from trajectory files
            costs_overall = []
            costs_success = []
            costs_failure = []
            for traj_path in glob.glob(os.path.join(str(folder), "*.traj")):
                with open(traj_path) as traj_file:
                    traj_data = json.load(traj_file)
                if "model_stats" not in traj_data["info"]:
                    continue
                run_cost = traj_data["info"]["model_stats"]["instance_cost"]
                inst_id = os.path.basename(traj_path).split(".")[0]
                costs_overall.append(run_cost)
                if inst_id in resolved_ids:
                    costs_success.append(run_cost)
                else:
                    costs_failure.append(run_cost)
            # Assemble the row for this run
            rows.append(
                [
                    model,
                    dataset,
                    setup,
                    temperature,
                    top_p,
                    cost,
                    install,
                    run,
                    report.get("# Not Generated", 0),
                    report.get("# Generated", 0),
                    report.get("# Applied", 0),
                    report.get("# Resolved", 0),
                    resolved_ids,
                    costs_success,
                    costs_failure,
                    costs_overall,
                ]
            )
    return rows

def get_results_df(folder_name, runs_max=None):
    rows = convert_experiments_to_rows(folder_name, runs_max)
    return pd.DataFrame(rows, columns=COLUMNS).sort_values(by=COLUMNS[:8])


def get_results_csv(folder_name):
    # runs_max defaults to None, so all runs are included
    get_results_df(folder_name).to_csv("results.csv")
    print("Experiment results written to results.csv")
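
# Quick interactive use (paths are illustrative):
#   df = get_results_df("../trajectories", runs_max=3)
#   df.head()
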
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Aggregate results from experiments")
    parser.add_argument("--folder", type=str, help="Folder containing experiment results", default="../trajectories")
    parser.add_argument("--model", nargs="+", type=str, help="Model(s) to filter results by.")
    parser.add_argument("--dataset", nargs="+", type=str, help="Dataset(s) to filter results by.")
    parser.add_argument("--setup", nargs="+", type=str, help="Setup(s) to filter results by.")
    parser.add_argument("--runs_min", type=int, help="Only keep experiments with at least this many runs.")
    parser.add_argument("--runs_max", type=int, help="Ignore runs numbered higher than this value.")
    args = parser.parse_args()

    df = get_results_df(args.folder, args.runs_max)
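
    # Aggregate runs of the same configuration: counts are averaged across
    # runs, Pass@K counts the unique instance IDs resolved in any run, and
    # cost columns are averaged over all instances across runs.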
    grouped_data = (
        df.groupby(COLUMNS[:7])
        .agg(
            {
                "Run": "count",  # Number of runs for this configuration
                "Not Generated": "mean",
                "Generated": "mean",
                "Applied": "mean",
                "Resolved": "mean",
                "Resolved IDs": lambda x: len({item for sublist in x for item in sublist}),
                "Costs Success": lambda x: np.mean([item for sublist in x for item in sublist]),
                "Costs Failure": lambda x: np.mean([item for sublist in x for item in sublist]),
                "Costs Overall": lambda x: np.mean([item for sublist in x for item in sublist]),
            }
        )
        .round(2)
        .reset_index()
        .rename(columns={"Resolved IDs": "Pass@K", "Run": "Runs"})
    )
    # Filtering
    if args.model:
        grouped_data = grouped_data[grouped_data["Model"].isin(args.model)]
    if args.dataset:
        grouped_data = grouped_data[grouped_data["Dataset"].isin(args.dataset)]
    if args.setup:
        grouped_data = grouped_data[grouped_data["Setup"].isin(args.setup)]
    if args.runs_min:
        # The "Run" column was renamed to "Runs" above
        grouped_data = grouped_data[grouped_data["Runs"] >= args.runs_min]

    print(f"Total experiments run: {grouped_data.shape[0]}")
    grouped_data_sorted = grouped_data.sort_values(by=["Dataset", "Resolved"], ascending=[True, False])
    pd.set_option("display.max_rows", None)
    grouped = grouped_data_sorted.groupby("Dataset")
    for name, group in grouped:
        print(f"\n-----------------\nDataset: {name}\n-----------------")
        print(group.to_string(index=False))
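
# Example invocations (folder layout is illustrative):
#   python aggregate_results.py
#   python aggregate_results.py --folder ../trajectories --model gpt-4 --runs_max 3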