evaluation.py

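"""Evaluate SWE-bench predictions and summarize the results.

Given a .jsonl file of model predictions, this script filters out empty patches,
runs the SWE-bench evaluation harness, then builds a per-instance scorecard
(trajectory stats, patch stats, test outcomes) and writes ``scorecards.json``
and ``results.json`` next to the predictions file, along with a summary report.
"""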
import argparse
import json
import os
import traceback
from collections import Counter

from datasets import load_dataset, load_from_disk
from rich import print
from swebench import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    get_eval_report,
    get_logs_eval,
    get_model_report,
    get_resolution_status,
    run_evaluation,
    get_eval_refs,
)
from swebench.harness.constants import (
    INSTALL_FAIL,
)
from unidiff import PatchSet


def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, timeout, verbose, conda_link, log_suffix, num_processes):
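    """Run evaluation for the predictions at ``predictions_path`` and write summaries.

    The parameters mirror the CLI flags defined under ``__main__``: paths to the
    predictions .jsonl file, the evaluation log directory, the SWE-bench task
    instances, and the testbed, plus harness options (``skip_existing``,
    ``timeout``, ``verbose``, ``conda_link``, ``log_suffix``, ``num_processes``).
    """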
    # Check if paths exist
    if not os.path.exists(predictions_path):
        raise FileNotFoundError(f"Predictions path {predictions_path} does not exist")

    eval_refs = get_eval_refs(swe_bench_tasks)
    for k, v in eval_refs.items():
        eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]}

    # Change model_name_or_path field to directory name for all predictions
    directory = os.path.dirname(predictions_path)
    directory_name = directory.rsplit("/", 1)[-1]
    pred_path_orig = predictions_path
    pred_path_temp = predictions_path.replace(".jsonl", "_filtered.jsonl")

    pred_total, pred_will_eval = 0, 0
    with open(pred_path_temp, "w") as f:
        for l in open(pred_path_orig, "r").readlines():
            pred_total += 1
            p = json.loads(l)
            # Exclude predictions w/ empty strings
            if p[KEY_PREDICTION] is not None and p[KEY_PREDICTION].strip() != "":
                p[KEY_MODEL] = directory_name
                json.dump(p, f)
                f.write("\n")
                pred_will_eval += 1
    print(
        f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total - pred_will_eval} are empty)"
    )

    # Run evaluation
    predictions_path = pred_path_temp
    try:
        print("🏃 Beginning evaluation...")
        run_evaluation(
            predictions_path=predictions_path,
            log_dir=log_dir,
            swe_bench_tasks=swe_bench_tasks,
            testbed=testbed,
            skip_existing=skip_existing,
            timeout=timeout,
            verbose=verbose,
            conda_link=conda_link,
            log_suffix=log_suffix,
            num_processes=num_processes,
        )
        print("✅ Finished evaluation")
    except Exception as e:
        print(f"❌ Evaluation failed: {e}\n{traceback.format_exc()}")
    print("==================================")

    os.remove(pred_path_temp)
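
    # The filtered temp file is only needed by run_evaluation; the scorecards
    # below are computed from the original (unfiltered) predictions file.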
    # Get predictions, define log_dir
    predictions = [json.loads(l) for l in open(pred_path_orig, "r").readlines()]
    log_dir = os.path.join(log_dir, directory_name)
    print(f"Log directory for evaluation run: {log_dir}")
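
    # Build one scorecard per prediction: the statuses it passed through
    # ("not_generated", "generated", "build_failure", "applied", "install_fail",
    # plus the swebench resolution status), trajectory stats, test results, and
    # patch statistics.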
    # Iterate through predictions
    scorecards = []
    for p in predictions:
        scorecard = {KEY_INSTANCE_ID: p[KEY_INSTANCE_ID], "statuses": [], "stats": {}}

        # Add trajectory statistics if traj_path exists
        traj_path = os.path.join(directory, f"{p[KEY_INSTANCE_ID]}.traj")
        if os.path.exists(traj_path):
            traj_data = json.load(open(traj_path, "r"))
            scorecard["stats"]["traj_num_steps"] = len(traj_data["trajectory"])
            scorecard["stats"]["traj_action_dist"] = dict(
                Counter(
                    [
                        entry["action"].strip().split()[0]
                        if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0
                        else None
                        for entry in traj_data["history"]
                    ]
                )
            )
            scorecard["exit_status"] = (
                traj_data["info"]["exit_status"]
                if "exit_status" in traj_data["info"]
                else "n/a"
            )

        # Check that a prediction was generated
        if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "":
            scorecard["statuses"].append("not_generated")
            scorecards.append(scorecard)
            continue
        scorecard["statuses"].append("generated")

        # Get log file
        log_path = os.path.join(
            log_dir, f"{p[KEY_INSTANCE_ID]}.{directory_name}.eval.log"
        )
        if not os.path.exists(log_path):
            scorecard["statuses"].append("build_failure")
            scorecards.append(scorecard)
            continue

        # Get evaluation logs
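        # `eval_sm` maps test cases in the log to their statuses; `found` is False
        # when the log does not reflect a completed run (e.g., the patch did not
        # apply), in which case the instance never reaches "applied".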
        eval_sm, found = get_logs_eval(log_path)

        # Check that the prediction patch applied successfully
        if not found:
            scorecards.append(scorecard)
            continue
        scorecard["statuses"].append("applied")

        with open(log_path, "r") as f:
            log_contents = f.read()
        if INSTALL_FAIL in log_contents:
            scorecard["statuses"].append("install_fail")

        # Get resolution status
        report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
        scorecard["test_results"] = {
            "failure": {
                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["failure"],
                "PASS_TO_PASS": report["PASS_TO_PASS"]["failure"],
            },
            "success": {
                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"],
                "PASS_TO_PASS": report["PASS_TO_PASS"]["success"],
            },
        }
        resolution_status = get_resolution_status(report)
        scorecard["statuses"].append(resolution_status)
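
        # Summarize the predicted patch (files touched, lines added/removed) with unidiff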
        try:
            diff_obj = PatchSet(p[KEY_PREDICTION])
            scorecard["patch_files"] = [
                x.path
                for x in diff_obj.modified_files
                + diff_obj.added_files
                + diff_obj.removed_files
            ]
            scorecard["patch_lines_add"] = sum([f.added for f in diff_obj])
            scorecard["patch_lines_del"] = sum([f.removed for f in diff_obj])
        except Exception as e:
            print(f"[{p[KEY_INSTANCE_ID]}] Error parsing prediction diff: {e}")
            scorecard["patch_files"] = []
            scorecard["patch_lines_add"] = 0
            scorecard["patch_lines_del"] = 0
        scorecards.append(scorecard)

    # Calculate cumulative results
    get_ids_with_status = lambda x: [
        s[KEY_INSTANCE_ID] for s in scorecards if x in s["statuses"]
    ]
    report = {
        "# Not Generated": len(get_ids_with_status("not_generated")),
        "# Generated": len(get_ids_with_status("generated")),
        "# Applied": len(get_ids_with_status("applied")),
        "# Resolved": len(get_ids_with_status("RESOLVED_FULL")),
        "# Install Fail": len(get_ids_with_status("install_fail")),
    }
    print(f"== Evaluation Report ==\n{report}")

    report_exits = dict(
        Counter([s["exit_status"] if "exit_status" in s else "n/a" for s in scorecards])
    )

    # Save to summary, scorecard json
    path_scorecards = os.path.join(directory, "scorecards.json")
    with open(path_scorecards, "w") as f:
        json.dump(scorecards, fp=f, indent=2)
    print(f"- Wrote per-instance scorecards to {path_scorecards}")

    path_results = os.path.join(directory, "results.json")
    with open(path_results, "w") as f:
        json.dump(
            {
                "report": report,
                "report_exits": report_exits,
                "not_generated": get_ids_with_status("not_generated"),
                "generated": get_ids_with_status("generated"),
                "applied": get_ids_with_status("applied"),
                "resolved": get_ids_with_status("RESOLVED_FULL"),
                "install_fail": get_ids_with_status("install_fail"),
            },
            fp=f,
            indent=2,
        )
    print(f"- Wrote summary of run to {path_results}")
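
    # Cross-check: get_model_report recomputes outcome counts directly from the
    # predictions file and eval logs, so its numbers should line up with the
    # scorecard-derived report above.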
    # Sanity check against get_model_report
    report = get_model_report(
        directory_name, pred_path_orig, swe_bench_tasks, log_dir
    )
    by_outcome = {}
    by_outcome_func = lambda status: len(
        [
            instance_id
            for _, v in report.items()
            if isinstance(v, dict)
            for instance_id in v[status]
        ]
    )
    by_outcome["# Not Generated"] = by_outcome_func("none")
    by_outcome["# Generated"] = by_outcome_func("generated")
    by_outcome["# Applied"] = by_outcome_func("applied")
    by_outcome["# Resolved"] = by_outcome_func("resolved")
    by_outcome["# Install Fail"] = by_outcome_func("install_fail")
    print(f"Reference Report:\n{by_outcome}")


if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--predictions_path",
        type=str,
        help="Path to predictions file (.jsonl)",
        required=True,
    )
    parser.add_argument(
        "--log_dir", type=str, help="Path to log directory", required=True
    )
    parser.add_argument(
        "--swe_bench_tasks",
        type=str,
        help="Path to SWE-bench task instances file",
        required=True,
    )
    parser.add_argument(
        "--testbed", type=str, help="Path to testbed directory", required=True
    )
    parser.add_argument(
        "--skip_existing", action="store_true", help="(Optional) Skip existing logs"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        help="(Optional) Timeout in seconds (default: 900)",
        default=900,
    )
    parser.add_argument(
        "--verbose", action="store_true", help="(Optional) Verbose mode"
    )
    parser.add_argument(
        "--conda_link", default=None, type=str, help="(Optional) URL to conda installation to use"
    )
    parser.add_argument(
        "--log_suffix", default=None, type=str, help="(Optional) Log suffix"
    )
    parser.add_argument(
        "--num_processes", default=-1, type=int, help="(Optional) Number of processes to use (default: -1)"
    )
    args = parser.parse_args()
    main(**vars(args))
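
# Example invocation (paths below are illustrative placeholders, not files
# shipped with this script):
#
#   python evaluation.py \
#       --predictions_path /path/to/run_dir/all_preds.jsonl \
#       --log_dir /path/to/eval_logs \
#       --swe_bench_tasks /path/to/swe-bench-tasks.json \
#       --testbed /path/to/testbed \
#       --skip_existing \
#       --verbose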