123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319 |
- from __future__ import annotations
- import http.server
- import json
- import os
- import socketserver
- from argparse import ArgumentParser
- from functools import partial
- from pathlib import Path
- from typing import Any
- import yaml
- def append_exit(content):
- last_entry = content["history"][-1]
- if last_entry["role"] == "system":
- return content
- exit_status = content.get("info", {}).get("exit_status", None)
- if exit_status is None:
- return content
- if exit_status.startswith("submitted"):
- if "submission" in content["info"]:
- submission = content["info"]["submission"]
- content["history"].append(
- {
- "role": "model_patch",
- "content": submission,
- },
- )
- # else submission should be in history already
- else:
- msg = "No submission in history or info"
- raise ValueError(msg)
- # elif content.get("info", {}).get("exit_status", None) is not None:
- # content["history"].append({
- # "role": "system",
- # "content": f"Exited - {content['info']['exit_status']}",
- # })
- return content
- def append_patch(instance_id, content, patches, patch_type):
- if content.get("info", {}).get("exit_status", None) is not None:
- if instance_id in patches:
- content["history"].append(
- {
- "role": f"{patch_type} Patch",
- "content": patches[instance_id],
- },
- )
- return content
- def append_results(traj_path: Path, instance_id: str, content, results, results_file, scorecards, scorecards_file):
- stats: list[str] = []
- model_stats = {}
- if traj_path.exists():
- data = json.loads(traj_path.read_text())
- info = data.get("info", {})
- model_stats = info.get("model_stats", {})
- instance_cost = model_stats.get("instance_cost", None)
- instance_cost = f"{instance_cost:.2f}" if instance_cost is not None else "N/A"
- tokens_sent = model_stats.get("tokens_sent", None)
- tokens_sent = f"{tokens_sent:,}" if tokens_sent is not None else "N/A"
- tokens_received = model_stats.get("tokens_received", None)
- tokens_received = f"{tokens_received:,}" if tokens_received is not None else "N/A"
- api_calls = model_stats.get("api_calls", None)
- api_calls = f"{api_calls:,}" if api_calls is not None else "N/A"
- stats.append("**** Run Stats ****")
- stats.append(f"Instance Cost: ${instance_cost}")
- stats.append(f"Tokens Sent: {tokens_sent}")
- stats.append(f"Tokens Received: {tokens_received}")
- stats.append(f"API Calls: {api_calls}\n")
- status = []
- if results is None:
- status.append("Evaluation results not found")
- elif "not_generated" in results and "generated" in results and "applied" in results and "resolved" in results:
- is_generated = instance_id in results["generated"]
- is_applied = instance_id in results["applied"]
- is_resolved = instance_id in results["resolved"]
- status.append("**** Statuses ****")
- status.append(
- f" {'✅' if is_generated else '❌'} Generated (The agent was {'' if is_generated else 'not '}"
- "able to generate a pull request to address this issue)",
- )
- status.append(
- f" {'✅' if is_applied else '❌'} Applied (The pull request was {'' if is_applied else 'not '}"
- "successfully applied to the repo during eval)",
- )
- status.append(
- f" {'✅' if is_resolved else '❌'} Resolved (The pull request {'' if is_resolved else 'not '}"
- "successfully resolved the issue during eval)",
- )
- else:
- status.append("Results format not recognized")
- if scorecards is not None:
- scorecard = [x for x in scorecards if x["instance_id"] == instance_id][0]
- if (
- "test_results" in scorecard
- and "failure" in scorecard["test_results"]
- and (
- len(scorecard["test_results"]["failure"]["FAIL_TO_PASS"]) > 0
- or len(scorecard["test_results"]["failure"]["PASS_TO_PASS"]) > 0
- )
- ):
- tests_failing = [f" - {x}" for x in scorecard["test_results"]["failure"]["FAIL_TO_PASS"]] + [
- f" - {x}" for x in scorecard["test_results"]["failure"]["PASS_TO_PASS"]
- ]
- status.extend(["", "**** Test Results ****", "🧪 Tests Failed"] + tests_failing[:7])
- if len(tests_failing) > 7:
- status.append(f" ... and {len(tests_failing) - 7} more")
- status.append("")
- if status == []:
- status.append("Instance not found in results")
- else:
- status.append("---------------------------")
- status.append(
- "Note that the evaluation results here may not be accurate or up to date, since they are computed separately from the agent run itself.",
- )
- status.append(f"Check {results_file} for the most accurate evaluation results.")
- status.append("")
- status.append(f"Instance ID: {instance_id}")
- status.append("Based on results:")
- status.append(json.dumps(results, indent=4))
- eval_report = {
- "role": "Evaluation Report",
- "content": "\n".join([*stats, *status]),
- }
- content["history"].insert(0, eval_report)
- content["history"].append(eval_report)
- return content
- def load_content(file_name, gold_patches, test_patches) -> dict[str, Any]:
- with open(file_name) as infile:
- content = json.load(infile)
- results_file = Path(file_name).parent / "results.json"
- results = load_results(results_file)
- scorecards_file = Path(file_name).parent / "scorecards.json"
- scorecards = None
- if scorecards_file.exists():
- with open(scorecards_file) as infile:
- scorecards = json.load(infile)
- content = append_exit(content) # accommodate new and old format
- content = append_patch(Path(file_name).stem, content, gold_patches, "Gold")
- content = append_patch(Path(file_name).stem, content, test_patches, "Test")
- return append_results(
- Path(file_name),
- Path(file_name).stem,
- content,
- results,
- results_file,
- scorecards,
- scorecards_file,
- )
- def load_results(results_path: Path) -> dict[str, Any] | None:
- """Load results from results.json.
- If file is not found, return None.
- """
- if not results_path.exists():
- return None
- with open(results_path) as infile:
- results = json.load(infile)
- # Different versions of the code used "not_generated" or "no_generation".
- # Let's standardize this here
- if "no_generation" in results:
- results["not_generated"] = results["no_generation"]
- del results["no_generation"]
- return results
- # TODO: shouldn't reload results fore very status
- def get_status(traj_path) -> str:
- """Return results emoji for single trajectory"""
- results = load_results(Path(traj_path).parent / "results.json")
- instance_id = Path(traj_path).stem
- if results is None:
- return "❓"
- elif "not_generated" in results and "generated" in results and "applied" in results and "resolved" in results:
- if instance_id in results["not_generated"]:
- return "❓"
- if instance_id in results["generated"]:
- if instance_id in results["resolved"]:
- return "✅"
- else:
- return "❌"
- return "❓"
- class Handler(http.server.SimpleHTTPRequestHandler):
- file_mod_times = {} # Dictionary to keep track of file modification times
- def __init__(self, *args, **kwargs):
- self.gold_patches = {}
- self.test_patches = {}
- if "gold_patches" in kwargs:
- self.gold_patches = kwargs.pop("gold_patches")
- if "test_patches" in kwargs:
- self.test_patches = kwargs.pop("test_patches")
- self.traj_dir = kwargs.pop("directory", ".") # Extract directory
- super().__init__(*args, **kwargs)
- def serve_directory_info(self):
- self.send_response(200)
- self.send_header("Content-type", "application/json")
- self.end_headers()
- self.wfile.write(json.dumps({"directory": self.traj_dir}).encode())
- def serve_file_content(self, file_path):
- try:
- content = load_content(
- Path(self.traj_dir) / file_path,
- self.gold_patches,
- self.test_patches,
- )
- self.send_response(200)
- self.send_header("Content-type", "text/plain")
- self.end_headers()
- self.wfile.write(json.dumps(content).encode())
- except FileNotFoundError:
- self.send_error(404, f"File {file_path} not found")
- def do_GET(self):
- if self.path == "/directory_info":
- self.serve_directory_info()
- elif self.path.startswith("/files"):
- self.handle_files_request()
- elif self.path.startswith("/trajectory/"):
- file_path = self.path[len("/trajectory/") :]
- self.serve_file_content(file_path)
- elif self.path.startswith("/check_update"):
- self.check_for_updates()
- else:
- super().do_GET()
- def handle_files_request(self):
- self.send_response(200)
- self.send_header("Content-type", "application/json")
- self.end_headers()
- files = sorted(
- (
- str(file.relative_to(Path(self.traj_dir))) + " " * 4 + get_status(file)
- for file in Path(self.traj_dir).glob("**/*.traj")
- ),
- key=lambda x: str(Path(self.traj_dir) / x),
- reverse=True,
- )
- self.wfile.write(json.dumps(files).encode())
- def check_for_updates(self):
- current_mod_times = {str(file): os.path.getmtime(file) for file in Path(self.traj_dir).glob("**/*.traj")}
- if current_mod_times != Handler.file_mod_times:
- Handler.file_mod_times = current_mod_times
- self.send_response(200) # Send response that there's an update
- else:
- self.send_response(204) # Send no content response if no update
- self.end_headers()
- def end_headers(self):
- self.send_header("Access-Control-Allow-Origin", "*")
- super().end_headers()
- def main(data_path, directory, port):
- data = []
- if data_path is not None:
- if data_path.endswith(".jsonl"):
- data = [json.loads(x) for x in Path(data_path).read_text().splitlines(keepends=True)]
- elif data_path.endswith(".json"):
- with open(data_path) as f:
- data = json.load(f)
- elif "args.yaml" in os.listdir(directory):
- with open(os.path.join(directory, "args.yaml")) as file:
- args = yaml.safe_load(file)
- if "environment" in args and "data_path" in args["environment"]:
- data_path = os.path.join(Path(__file__).parent, "..", args["environment"]["data_path"])
- if os.path.exists(data_path):
- with open(data_path) as f:
- data = json.load(f)
- gold_patches = {d["instance_id"]: d["patch"] if "patch" in d else None for d in data}
- test_patches = {d["instance_id"]: d["test_patch"] if "test_patch" in d else None for d in data}
- handler_with_directory = partial(
- Handler,
- directory=directory,
- gold_patches=gold_patches,
- test_patches=test_patches,
- )
- try:
- with socketserver.TCPServer(("", port), handler_with_directory) as httpd:
- print(f"Serving at http://localhost:{port}")
- httpd.serve_forever()
- except OSError as e:
- if e.errno == 48:
- print(f"ERROR: Port ({port}) is already in use. Try another port with the --port flag.")
- else:
- raise e
- if __name__ == "__main__":
- parser = ArgumentParser()
- parser.add_argument("--data_path", type=str, help="Path to dataset that was used for the trajectories")
- parser.add_argument("--directory", type=str, help="Directory to serve", default="./trajectories", nargs="?")
- parser.add_argument("--port", type=int, help="Port to serve", default=8000)
- args = parser.parse_args()
- main(**vars(args))
|