# repro-ci.py 21 KB
"""Create an AWS instance to reproduce Buildkite CI builds.

This script will take a Buildkite build URL as an argument and create
an AWS instance with the same properties running the same Docker container
as the original Buildkite runner. The user is then attached to this instance
and can reproduce any build commands as if they were executed within the
runner.

This utility can be used to reproduce and debug build failures that come up
on the Buildkite runner instances but not on a local machine.

Optionally, build commands can be executed automatically. Filters can be added
to exclude some of these commands. For instance, some users may want to execute
all build commands except for the `bazel build` commands, which they would
like to execute manually.

Usage:

    python repro-ci.py [-n instance-name] [-c] [-f filter1] [-f filter2] ... [build-url]

Arguments:

    -n: Instance name to be used. If an instance with this name already exists,
        it will be reused.
    -c: Execute commands after setting up the machine.
    -f: Filter these commands (do not execute commands that match this
        regex pattern).
    build-url: Buildkite job URL to reproduce. If omitted, the script prompts
        for it.
"""
  22. import base64
  23. import json
  24. import logging
  25. import os
  26. import random
  27. import re
  28. import shlex
  29. import subprocess
  30. import threading
  31. import time
  32. from numbers import Number
  33. from typing import Any, Dict, List, Optional, Callable
  34. import boto3
  35. import click
  36. import paramiko
  37. import yaml
  38. from pybuildkite.buildkite import Buildkite
  39. def maybe_fetch_buildkite_token():
  40. if os.environ.get("BUILDKITE_TOKEN", None) is None:
  41. print("Missing BUILDKITE_TOKEN, retrieving from AWS secrets store")
  42. os.environ["BUILDKITE_TOKEN"] = boto3.client(
  43. "secretsmanager", region_name="us-west-2"
  44. ).get_secret_value(
  45. SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
  46. "buildkite/ro-token"
  47. )[
  48. "SecretString"
  49. ]
  50. def escape(v: Any):
  51. if isinstance(v, bool):
  52. return f"{int(v)}"
  53. elif isinstance(v, Number):
  54. return str(v)
  55. elif isinstance(v, list):
  56. return " ".join(shlex.quote(w) for w in v)
  57. else:
  58. return v
  59. def env_str(env: Dict[str, Any]):
  60. kvs = []
  61. for k, v in env.items():
  62. if isinstance(v, bool):
  63. kvs.append((k, int(v)))
  64. elif isinstance(v, Number):
  65. kvs.append((k, str(v)))
  66. elif isinstance(v, list):
  67. for i, w in enumerate(v):
  68. kvs.append((f"{k}_{i}", w))
  69. else:
  70. kvs.append((k, v))
  71. return " ".join(f"{k}={shlex.quote(v)}" for k, v in kvs)
  72. def script_str(v: Any):
  73. if isinstance(v, bool):
  74. return f'"{int(v)}"'
  75. elif isinstance(v, Number):
  76. return f'"{v}"'
  77. elif isinstance(v, list):
  78. return "(" + " ".join(f'"{shlex.quote(w)}"' for w in v) + ")"
  79. else:
  80. return f'"{shlex.quote(v)}"'
  81. class ReproSession:
  82. plugin_default_env = {
  83. "docker": {"BUILDKITE_PLUGIN_DOCKER_MOUNT_BUILDKITE_AGENT": False}
  84. }
  85. def __init__(
  86. self,
  87. buildkite_token: str,
  88. instance_name: Optional[str] = None,
  89. logger: Optional[logging.Logger] = None,
  90. ):
  91. self.logger = logger or logging.getLogger(self.__class__.__name__)
  92. self.bk = Buildkite()
  93. self.bk.set_access_token(buildkite_token)
  94. self.ssh_user = "ec2-user"
  95. self.ssh_key_name = "buildkite-repro-env"
  96. self.ssh_key_file = "~/.ssh/buildkite-repro-env.pem"
  97. self.ec2_client = boto3.client("ec2", region_name="us-west-2")
  98. self.ec2_resource = boto3.resource("ec2", region_name="us-west-2")
  99. self.org = None
  100. self.pipeline = None
  101. self.build_id = None
  102. self.job_id = None
  103. self.env: Dict[str, str] = {}
  104. self.aws_instance_name = instance_name
  105. self.aws_instance_id = None
  106. self.aws_instance_ip = None
  107. self.ssh = None
  108. self.plugins = {}
  109. self.skipped_commands = []
  110. def set_session(self, session_url: str):
  111. # E.g.:
  112. # https://buildkite.com/ray-project/ray-builders-pr/
  113. # builds/19635#55a0d71a-831e-4f68-b668-2b10c6f65ee6
  114. pattern = re.compile(
  115. "https://buildkite.com/([^/]+)/([^/]+)/builds/([0-9]+)#(.+)"
  116. )
  117. org, pipeline, build_id, job_id = pattern.match(session_url).groups()
  118. self.logger.debug(
  119. f"Parsed session URL: {session_url}. "
  120. f"Got org='{org}', pipeline='{pipeline}', "
  121. f"build_id='{build_id}', job_id='{job_id}'."
  122. )
  123. self.org = org
  124. self.pipeline = pipeline
  125. self.build_id = build_id
  126. self.job_id = job_id
  127. def fetch_env_variables(self, overwrite: Optional[Dict[str, Any]] = None):
  128. assert self.bk
  129. self.env = self.bk.jobs().get_job_environment_variables(
  130. self.org, self.pipeline, self.build_id, self.job_id
  131. )["env"]
  132. if overwrite:
  133. self.env.update(overwrite)
  134. return self.env
  135. def aws_start_instance(self):
  136. assert self.env
  137. if not self.aws_instance_name:
  138. self.aws_instance_name = f"repro_ci_{self.build_id}_{self.job_id[:8]}"
  139. self.logger.info(
  140. f"No instance name provided, using {self.aws_instance_name}"
  141. )
  142. instance_type = self.env["BUILDKITE_AGENT_META_DATA_AWS_INSTANCE_TYPE"]
  143. instance_ami = self.env["BUILDKITE_AGENT_META_DATA_AWS_AMI_ID"]
  144. instance_sg = "sg-0ccfca2ef191c04ae"
  145. instance_block_device_mappings = [
  146. {"DeviceName": "/dev/xvda", "Ebs": {"VolumeSize": 500}}
  147. ]
  148. # Check if instance exists:
  149. running_instances = self.ec2_resource.instances.filter(
  150. Filters=[
  151. {"Name": "tag:Name", "Values": [self.aws_instance_name]},
  152. {"Name": "instance-state-name", "Values": ["running"]},
  153. ]
  154. )
  155. self.logger.info(
  156. f"Check if instance with name {self.aws_instance_name} "
  157. f"already exists..."
  158. )
  159. for instance in running_instances:
  160. self.aws_instance_id = instance.id
  161. self.aws_instance_ip = instance.public_ip_address
  162. self.logger.info(f"Found running instance {self.aws_instance_id}.")
  163. return
  164. self.logger.info(
  165. f"Instance with name {self.aws_instance_name} not found, " f"creating..."
  166. )
  167. # Else, not running, yet, start.
  168. instance = self.ec2_resource.create_instances(
  169. BlockDeviceMappings=instance_block_device_mappings,
  170. ImageId=instance_ami,
  171. InstanceType=instance_type,
  172. KeyName=self.ssh_key_name,
  173. SecurityGroupIds=[instance_sg],
  174. TagSpecifications=[
  175. {
  176. "ResourceType": "instance",
  177. "Tags": [{"Key": "Name", "Value": self.aws_instance_name}],
  178. }
  179. ],
  180. MinCount=1,
  181. MaxCount=1,
  182. )[0]
  183. self.aws_instance_id = instance.id
  184. self.logger.info(f"Created new instance with ID {self.aws_instance_id}")
  185. def aws_wait_for_instance(self):
  186. assert self.aws_instance_id
  187. self.logger.info("Waiting for instance to come up...")
  188. repro_instance_state = None
  189. while repro_instance_state != "running":
  190. detail = self.ec2_client.describe_instances(
  191. InstanceIds=[self.aws_instance_id],
  192. )
  193. repro_instance_state = detail["Reservations"][0]["Instances"][0]["State"][
  194. "Name"
  195. ]
  196. if repro_instance_state != "running":
  197. time.sleep(2)
  198. self.aws_instance_ip = detail["Reservations"][0]["Instances"][0][
  199. "PublicIpAddress"
  200. ]
  201. def aws_stop_instance(self):
  202. assert self.aws_instance_id
  203. self.ec2_client.terminate_instances(
  204. InstanceIds=[self.aws_instance_id],
  205. )
  206. def print_stop_command(self):
  207. click.secho("To stop this instance in the future, run this: ")
  208. click.secho(
  209. f"aws ec2 terminate-instances " f"--instance-ids={self.aws_instance_id}",
  210. bold=True,
  211. )
  212. def create_new_ssh_client(self):
  213. assert self.aws_instance_ip
  214. if self.ssh:
  215. self.ssh.close()
  216. self.logger.info(
  217. "Creating SSH client and waiting for SSH to become available..."
  218. )
  219. ssh = paramiko.client.SSHClient()
  220. ssh.load_system_host_keys()
  221. ssh.set_missing_host_key_policy(paramiko.WarningPolicy())
  222. timeout = time.monotonic() + 60
  223. while time.monotonic() < timeout:
  224. try:
  225. ssh.connect(
  226. self.aws_instance_ip,
  227. username=self.ssh_user,
  228. key_filename=os.path.expanduser(self.ssh_key_file),
  229. )
  230. break
  231. except paramiko.ssh_exception.NoValidConnectionsError:
  232. self.logger.info("SSH not ready, yet, sleeping 5 seconds")
  233. time.sleep(5)
  234. self.ssh = ssh
  235. return self.ssh
  236. def close_ssh(self):
  237. self.ssh.close()
  238. def ssh_exec(self, command, quiet: bool = False, *args, **kwargs):
  239. result = {}
  240. def exec():
  241. stdin, stdout, stderr = self.ssh.exec_command(command, get_pty=True)
  242. output = ""
  243. for line in stdout.readlines():
  244. output += line
  245. if not quiet:
  246. print(line, end="")
  247. for line in stderr.readlines():
  248. if not quiet:
  249. print(line, end="")
  250. result["output"] = output
  251. thread = threading.Thread(target=exec)
  252. thread.start()
  253. status = time.monotonic() + 30
  254. while thread.is_alive():
  255. thread.join(2)
  256. if time.monotonic() >= status and thread.is_alive():
  257. self.logger.info("Still executing...")
  258. status = time.monotonic() + 30
  259. thread.join()
  260. return result.get("output", "")
  261. def execute_ssh_command(
  262. self,
  263. command: str,
  264. env: Optional[Dict[str, str]] = None,
  265. as_script: bool = False,
  266. quiet: bool = False,
  267. command_wrapper: Optional[Callable[[str], str]] = None,
  268. ) -> str:
  269. assert self.ssh
  270. if not command_wrapper:
  271. def command_wrapper(s):
  272. return s
  273. full_env = self.env.copy()
  274. if env:
  275. full_env.update(env)
  276. if as_script:
  277. ftp = self.ssh.open_sftp()
  278. file = ftp.file("/tmp/script.sh", "w", -1)
  279. file.write("#!/bin/bash\n")
  280. for k, v in env.items():
  281. file.write(f"{k}={script_str(v)}\n")
  282. file.write(command + "\n")
  283. file.flush()
  284. ftp.close()
  285. full_command = "bash /tmp/script.sh"
  286. else:
  287. full_command = f"export {env_str(full_env)}; {command}"
  288. full_command = command_wrapper(full_command)
  289. self.logger.debug(f"Executing command: {command}")
  290. output = self.ssh_exec(full_command, quiet=quiet, get_pty=True)
  291. return output
  292. def execute_ssh_commands(
  293. self,
  294. commands: List[str],
  295. env: Optional[Dict[str, str]] = None,
  296. quiet: bool = False,
  297. ):
  298. for command in commands:
  299. self.execute_ssh_command(command, env=env, quiet=quiet)
  300. def execute_docker_command(
  301. self, command: str, env: Optional[Dict[str, str]] = None, quiet: bool = False
  302. ):
  303. def command_wrapper(s):
  304. escaped = s.replace("'", "'\"'\"'")
  305. return f"docker exec -it ray_container /bin/bash -ci '{escaped}'"
  306. self.execute_ssh_command(
  307. command, env=env, quiet=quiet, command_wrapper=command_wrapper
  308. )
  309. def prepare_instance(self):
  310. self.create_new_ssh_client()
  311. output = self.execute_ssh_command("docker ps", quiet=True)
  312. if "CONTAINER ID" in output:
  313. self.logger.info("Instance already prepared.")
  314. return
  315. self.logger.info("Preparing instance (installing docker etc.)")
  316. commands = [
  317. "sudo yum install -y docker",
  318. "sudo service docker start",
  319. f"sudo usermod -aG docker {self.ssh_user}",
  320. ]
  321. self.execute_ssh_commands(commands, quiet=True)
  322. self.create_new_ssh_client()
  323. self.execute_ssh_command("docker ps", quiet=True)
  324. self.docker_login()
  325. def docker_login(self):
  326. self.logger.info("Logging into docker...")
  327. credentials = boto3.client(
  328. "ecr", region_name="us-west-2"
  329. ).get_authorization_token()
  330. token = (
  331. base64.b64decode(credentials["authorizationData"][0]["authorizationToken"])
  332. .decode("utf-8")
  333. .replace("AWS:", "")
  334. )
  335. endpoint = credentials["authorizationData"][0]["proxyEndpoint"]
  336. self.execute_ssh_command(
  337. f"docker login -u AWS -p {token} {endpoint}", quiet=True
  338. )
  339. def fetch_buildkite_plugins(self):
  340. assert self.env
  341. self.logger.info("Fetching Buildkite plugins")
  342. plugins = json.loads(self.env["BUILDKITE_PLUGINS"])
  343. for collection in plugins:
  344. for plugin, options in collection.items():
  345. plugin_url, plugin_version = plugin.split("#")
  346. if not plugin_url.startswith("http://") or not plugin_url.startswith(
  347. "https://"
  348. ):
  349. plugin_url = f"https://{plugin_url}"
  350. plugin_name = plugin_url.split("/")[-1].rstrip(".git")
  351. plugin_short = plugin_name.replace("-buildkite-plugin", "")
  352. plugin_dir = f"~/{plugin_name}"
  353. plugin_env = self.get_plugin_env(plugin_short, options)
  354. self.plugins[plugin_short] = {
  355. "name": plugin_name,
  356. "options": options,
  357. "short": plugin_short,
  358. "url": plugin_url,
  359. "version": plugin_version,
  360. "dir": plugin_dir,
  361. "env": plugin_env,
  362. "details": {},
  363. }
  364. def get_plugin_env(self, plugin_short: str, options: Dict[str, Any]):
  365. plugin_env = {}
  366. for option, value in options.items():
  367. option_name = option.replace("-", "_").upper()
  368. env_name = f"BUILDKITE_PLUGIN_{plugin_short.upper()}_{option_name}"
  369. plugin_env[env_name] = value
  370. plugin_env.update(self.plugin_default_env.get(plugin_short, {}))
  371. return plugin_env
  372. def install_buildkite_plugin(self, plugin: str):
  373. assert plugin in self.plugins
  374. self.logger.info(f"Installing Buildkite plugin: {plugin}")
  375. plugin_dir = self.plugins[plugin]["dir"]
  376. plugin_url = self.plugins[plugin]["url"]
  377. plugin_version = self.plugins[plugin]["version"]
  378. self.execute_ssh_command(
  379. f"[ ! -e {plugin_dir} ] && git clone --depth 1 "
  380. f"--branch {plugin_version} {plugin_url} {plugin_dir}",
  381. quiet=True,
  382. )
  383. def load_plugin_details(self, plugin: str):
  384. assert plugin in self.plugins
  385. plugin_dir = self.plugins[plugin]["dir"]
  386. yaml_str = self.execute_ssh_command(f"cat {plugin_dir}/plugin.yml", quiet=True)
  387. details = yaml.safe_load(yaml_str)
  388. self.plugins[plugin]["details"] = details
  389. return details
  390. def execute_plugin_hook(
  391. self,
  392. plugin: str,
  393. hook: str,
  394. env: Optional[Dict[str, Any]] = None,
  395. script_command: Optional[str] = None,
  396. ):
  397. assert plugin in self.plugins
  398. self.logger.info(
  399. f"Executing Buildkite hook for plugin {plugin}: {hook}. "
  400. f"This pulls a Docker image and could take a while."
  401. )
  402. plugin_dir = self.plugins[plugin]["dir"]
  403. plugin_env = self.plugins[plugin]["env"].copy()
  404. if env:
  405. plugin_env.update(env)
  406. script_command = script_command or "bash -l"
  407. hook_script = f"{plugin_dir}/hooks/{hook}"
  408. self.execute_ssh_command(
  409. f"[ -f {hook_script} ] && cat {hook_script} | {script_command} ",
  410. env=plugin_env,
  411. as_script=False,
  412. quiet=True,
  413. )
  414. def print_buildkite_command(self, skipped: bool = False):
  415. print("-" * 80)
  416. print(
  417. "These are the commands you need to execute to fully reproduce " "the run"
  418. )
  419. print("-" * 80)
  420. print(self.env["BUILDKITE_COMMAND"])
  421. print("-" * 80)
  422. if skipped and self.skipped_commands:
  423. print(
  424. "Some of the commands above have already been run. "
  425. "Remaining commands:"
  426. )
  427. print("-" * 80)
  428. print("\n".join(self.skipped_commands))
  429. print("-" * 80)
  430. def run_buildkite_command(self, command_filter: Optional[List[str]] = None):
  431. commands = self.env["BUILDKITE_COMMAND"].split("\n")
  432. regexes = [re.compile(cf) for cf in command_filter or []]
  433. skipped_commands = []
  434. for command in commands:
  435. if any(rx.search(command) for rx in regexes):
  436. self.logger.info(f"Filtered build command: {command}")
  437. skipped_commands.append(command)
  438. continue
  439. self.logger.info(f"Executing build command: {command}")
  440. self.execute_docker_command(command)
  441. self.skipped_commands = skipped_commands
  442. def transfer_env_to_container(self):
  443. escaped = env_str(self.env).replace("'", "'\"'\"'")
  444. self.execute_docker_command(
  445. f"grep -q 'source ~/.env' $HOME/.bashrc "
  446. f"|| echo 'source ~/.env' >> $HOME/.bashrc; "
  447. f"echo 'export {escaped}' > $HOME/.env",
  448. quiet=True,
  449. )
  450. def attach_to_container(self):
  451. self.logger.info("Attaching to AWS instance...")
  452. ssh_command = (
  453. f"ssh -ti {self.ssh_key_file} "
  454. f"-o StrictHostKeyChecking=no "
  455. f"-o ServerAliveInterval=30 "
  456. f"{self.ssh_user}@{self.aws_instance_ip} "
  457. f"'docker exec -it ray_container bash -l'"
  458. )
  459. subprocess.run(ssh_command, shell=True)
  460. @click.command()
  461. @click.argument("session_url", required=False)
  462. @click.option("-n", "--instance-name", default=None)
  463. @click.option("-c", "--commands", is_flag=True, default=False)
  464. @click.option("-f", "--filters", multiple=True, default=[])
  465. def main(
  466. session_url: Optional[str],
  467. instance_name: Optional[str] = None,
  468. commands: bool = False,
  469. filters: Optional[List[str]] = None,
  470. ):
  471. if filters and not commands:
  472. raise ValueError(
  473. "Must specify the command flag '-c' to use filter options '-f'."
  474. )
  475. random.seed(1235)
  476. logger = logging.getLogger("main")
  477. logger.setLevel(logging.INFO)
  478. handler = logging.StreamHandler()
  479. handler.setFormatter(
  480. logging.Formatter(
  481. "[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
  482. )
  483. )
  484. logger.addHandler(handler)
  485. maybe_fetch_buildkite_token()
  486. repro = ReproSession(
  487. os.environ["BUILDKITE_TOKEN"], instance_name=instance_name, logger=logger
  488. )
  489. session_url = session_url or click.prompt(
  490. "Please copy and paste the Buildkite job build URI here"
  491. )
  492. repro.set_session(session_url)
  493. repro.fetch_env_variables()
  494. repro.aws_start_instance()
  495. repro.aws_wait_for_instance()
  496. print("-" * 80)
  497. click.secho("Instance ID: ", nl=False)
  498. click.secho(repro.aws_instance_id, bold=True)
  499. click.secho("Instance IP: ", nl=False)
  500. click.secho(repro.aws_instance_ip, bold=True)
  501. print("-" * 80)
  502. logger.info(f"Instance IP: {repro.aws_instance_ip}")
  503. repro.prepare_instance()
  504. repro.docker_login()
  505. repro.fetch_buildkite_plugins()
  506. for plugin in repro.plugins:
  507. repro.install_buildkite_plugin(plugin)
  508. repro.execute_plugin_hook("dind", "pre-command")
  509. repro.execute_plugin_hook(
  510. "docker",
  511. "command",
  512. env={
  513. "BUILDKITE_COMMAND": "sleep infinity",
  514. "BUILDKITE_PLUGIN_DOCKER_TTY": "0",
  515. "BUILDKITE_PLUGIN_DOCKER_MOUNT_CHECKOUT": "0",
  516. },
  517. script_command=(
  518. "sed -E 's/"
  519. "docker run/"
  520. "docker run "
  521. "--cap-add=SYS_PTRACE "
  522. "--name ray_container "
  523. "-d/g' | "
  524. "bash -l"
  525. ),
  526. )
  527. repro.create_new_ssh_client()
  528. repro.print_buildkite_command()
  529. if commands:
  530. filters = filters or []
  531. repro.run_buildkite_command(command_filter=filters)
  532. repro.print_buildkite_command(skipped=True)
  533. repro.transfer_env_to_container()
  534. # Print once more before attaching
  535. click.secho("Instance ID: ", nl=False)
  536. click.secho(repro.aws_instance_id, bold=True)
  537. click.secho("Instance IP: ", nl=False)
  538. click.secho(repro.aws_instance_ip, bold=True)
  539. print("-" * 80)
  540. repro.attach_to_container()
  541. logger.info("You are now detached from the AWS instance.")
  542. if click.confirm("Stop AWS instance?", default=False):
  543. repro.aws_stop_instance()
  544. else:
  545. repro.print_stop_command()
  546. if __name__ == "__main__":
  547. main()