evaluation.py

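"""Run the SWE-bench evaluation harness over a file of model predictions.

Filters out empty predictions, invokes swebench's `run_evaluation`, then parses
the per-instance evaluation logs (plus any `.traj` trajectory files stored next
to the predictions) into scorecards. Writes `scorecards.json` and `results.json`
to the predictions directory and prints a summary report.
"""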
import argparse
import json
import os
import traceback

from collections import Counter

from rich import print
from swebench import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    get_eval_report,
    get_logs_eval,
    get_model_report,
    get_resolution_status,
    run_evaluation,
    get_eval_refs,
)
from swebench.harness.constants import (
    INSTALL_FAIL,
)
from unidiff import PatchSet
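
# Each line of the predictions .jsonl file is a JSON object keyed by the
# swebench constants imported above (KEY_INSTANCE_ID, KEY_MODEL, KEY_PREDICTION,
# where the prediction is the model-generated patch). Illustrative record,
# assuming the usual values of those constants and with hypothetical values:
#   {"instance_id": "django__django-11999", "model_name_or_path": "<run name>",
#    "model_patch": "diff --git a/... b/..."}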


def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, timeout, verbose, conda_link, log_suffix, num_processes):
    # Check if paths exist
    if not os.path.exists(predictions_path):
        raise FileNotFoundError(f"Predictions path {predictions_path} does not exist")
    eval_refs = get_eval_refs(swe_bench_tasks)
    for k, v in eval_refs.items():
        eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]}

    # Change model_name_or_path field to directory name for all predictions
    directory = os.path.dirname(predictions_path)
    directory_name = directory.rsplit("/", 1)[-1]
    pred_path_orig = predictions_path
    pred_path_temp = predictions_path.replace(".jsonl", "_filtered.jsonl")

    pred_total, pred_will_eval = 0, 0
    with open(pred_path_temp, "w") as f:
        for l in open(pred_path_orig, "r").readlines():
            pred_total += 1
            p = json.loads(l)
            # Exclude predictions w/ empty strings
            if p[KEY_PREDICTION] is not None and p[KEY_PREDICTION].strip() != "":
                p[KEY_MODEL] = directory_name
                json.dump(p, f)
                f.write("\n")
                pred_will_eval += 1
    print(
        f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
    )

    # Run evaluation
    predictions_path = pred_path_temp
    try:
        print("🏃 Beginning evaluation...")
        run_evaluation(
            predictions_path=predictions_path,
            log_dir=log_dir,
            swe_bench_tasks=swe_bench_tasks,
            testbed=testbed,
            skip_existing=skip_existing,
            timeout=timeout,
            verbose=verbose,
            conda_link=conda_link,
            log_suffix=log_suffix,
            num_processes=num_processes,
        )
        print("✅ Finished evaluation")
    except Exception as e:
        print(f"❌ Evaluation failed: {e}\n{traceback.format_exc()}")
    print("==================================")
    os.remove(pred_path_temp)

    # Get predictions, define log_dir
    predictions = [json.loads(l) for l in open(pred_path_orig, "r").readlines()]
    log_dir = os.path.join(log_dir, directory_name)
    print(f"Log directory for evaluation run: {log_dir}")

    # Iterate through predictions
    scorecards = []
    for p in predictions:
        scorecard = {KEY_INSTANCE_ID: p[KEY_INSTANCE_ID], "statuses": [], "stats": {}}

        # Add trajectory statistics if traj_path exists
        traj_path = os.path.join(directory, f"{p[KEY_INSTANCE_ID]}.traj")
        if os.path.exists(traj_path):
            traj_data = json.load(open(traj_path, "r"))
            scorecard["stats"]["traj_num_steps"] = len(traj_data["trajectory"])
            scorecard["stats"]["traj_action_dist"] = dict(
                Counter(
                    [
                        entry["action"].strip().split()[0]
                        if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0
                        else None
                        for entry in traj_data["history"]
                    ]
                )
            )
            scorecard["exit_status"] = (
                traj_data["info"]["exit_status"]
                if "exit_status" in traj_data["info"]
                else "n/a"
            )

        # Check that a prediction was generated
        if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "":
            scorecard["statuses"].append("not_generated")
            scorecards.append(scorecard)
            continue
        scorecard["statuses"].append("generated")

        # Get log file
        log_path = os.path.join(
            log_dir, f"{p[KEY_INSTANCE_ID]}.{directory_name}.eval.log"
        )
        if not os.path.exists(log_path):
            scorecard["statuses"].append("build_failure")
            scorecards.append(scorecard)
            continue

        # Get evaluation logs
        eval_sm, found = get_logs_eval(log_path)

        # Check that the prediction was applied successfully
        if not found:
            scorecards.append(scorecard)
            continue
        scorecard["statuses"].append("applied")

        with open(log_path, "r") as f:
            log_contents = f.read()
        if INSTALL_FAIL in log_contents:
            scorecard["statuses"].append("install_fail")

        # Get resolution status
        report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
        scorecard["test_results"] = {
            "failure": {
                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["failure"],
                "PASS_TO_PASS": report["PASS_TO_PASS"]["failure"],
            },
            "success": {
                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"],
                "PASS_TO_PASS": report["PASS_TO_PASS"]["success"],
            },
        }
        resolution_status = get_resolution_status(report)
        scorecard["statuses"].append(resolution_status)

        try:
            diff_obj = PatchSet(p[KEY_PREDICTION])
            scorecard["patch_files"] = [
                x.path
                for x in diff_obj.modified_files
                + diff_obj.added_files
                + diff_obj.removed_files
            ]
            scorecard["patch_lines_add"] = sum([f.added for f in diff_obj])
            scorecard["patch_lines_del"] = sum([f.removed for f in diff_obj])
        except Exception as e:
            print(f"[{p[KEY_INSTANCE_ID]}] Error parsing prediction diff: {e}")
            scorecard["patch_files"] = []
            scorecard["patch_lines_add"] = 0
            scorecard["patch_lines_del"] = 0
        scorecards.append(scorecard)
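
    # Shape of a completed scorecard (illustrative only; values are hypothetical):
    #   {KEY_INSTANCE_ID: "sympy__sympy-12345",
    #    "statuses": ["generated", "applied", "RESOLVED_FULL"],
    #    "stats": {"traj_num_steps": 17, "traj_action_dist": {"edit": 5, "python": 3}},
    #    "exit_status": "submitted",
    #    "test_results": {"failure": {...}, "success": {...}},
    #    "patch_files": ["sympy/core/add.py"], "patch_lines_add": 12, "patch_lines_del": 3}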

    # Calculate cumulative results
    get_ids_with_status = lambda x: [
        s[KEY_INSTANCE_ID] for s in scorecards if x in s["statuses"]
    ]
    report = {
        "# Not Generated": len(get_ids_with_status("not_generated")),
        "# Generated": len(get_ids_with_status("generated")),
        "# Applied": len(get_ids_with_status("applied")),
        "# Resolved": len(get_ids_with_status("RESOLVED_FULL")),
        "# Install Fail": len(get_ids_with_status("install_fail")),
    }
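    # e.g. {"# Not Generated": 4, "# Generated": 96, "# Applied": 90, "# Resolved": 35, "# Install Fail": 2}
    # (counts above are hypothetical)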
    print(f"== Evaluation Report ==\n{report}")

    report_exits = dict(
        Counter([s["exit_status"] if "exit_status" in s else "n/a" for s in scorecards])
    )

    # Save to summary, scorecard json
    path_scorecards = os.path.join(directory, "scorecards.json")
    with open(path_scorecards, "w") as f:
        json.dump(scorecards, fp=f, indent=2)
    print(f"- Wrote per-instance scorecards to {path_scorecards}")

    path_results = os.path.join(directory, "results.json")
    with open(path_results, "w") as f:
        json.dump(
            {
                "report": report,
                "report_exits": report_exits,
                "not_generated": get_ids_with_status("not_generated"),
                "generated": get_ids_with_status("generated"),
                "applied": get_ids_with_status("applied"),
                "resolved": get_ids_with_status("RESOLVED_FULL"),
                "install_fail": get_ids_with_status("install_fail"),
            },
            fp=f,
            indent=2,
        )
    print(f"- Wrote summary of run to {path_results}")

    # Sanity check against get_model_report
    report = get_model_report(
        directory_name, pred_path_orig, swe_bench_tasks, log_dir
    )
    print("Reference Report:")
    for k, v in report.items():
        print(f"- {k}: {len(v)}")


if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--predictions_path",
        type=str,
        help="Path to predictions file (.jsonl)",
        required=True,
    )
    parser.add_argument(
        "--log_dir", type=str, help="Path to log directory", required=True
    )
    parser.add_argument(
        "--swe_bench_tasks",
        type=str,
        help="Path to SWE-bench task instances file",
        required=True,
    )
    parser.add_argument(
        "--testbed", type=str, help="Path to testbed directory", required=True
    )
    parser.add_argument(
        "--skip_existing", action="store_true", help="(Optional) Skip existing logs"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        help="(Optional) Timeout in seconds (default: 900)",
        default=900,
    )
    parser.add_argument(
        "--verbose", action="store_true", help="(Optional) Verbose mode"
    )
    parser.add_argument(
        "--conda_link", default=None, type=str, help="(Optional) URL to conda installation to use"
    )
    parser.add_argument(
        "--log_suffix", default=None, type=str, help="(Optional) Log suffix"
    )
    parser.add_argument(
        "--num_processes", default=-1, type=int, help="(Optional) Number of processes to use (default: -1)"
    )
    args = parser.parse_args()
    main(**vars(args))
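
# Example invocation (paths are placeholders):
#   python evaluation.py \
#       --predictions_path <path/to/predictions>.jsonl \
#       --log_dir <path/to/logs> \
#       --swe_bench_tasks <path/to/swe-bench-tasks.json> \
#       --testbed <path/to/testbed> \
#       --skip_existing \
#       --verbose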