# load_doc_cache.py

import os
import subprocess
import tarfile
import time

import boto3
import botocore
import click
import requests
from botocore import UNSIGNED
from botocore.client import Config

S3_BUCKET = "ray-ci-results"
DOC_BUILD_DIR_S3 = "doc_build"
LAST_BUILD_CUTOFF = 3  # how many days ago to consider a build outdated
PENDING_FILES_PATH = "pending_files.txt"
ENVIRONMENT_PICKLE = "_build/doctrees/environment.pickle"
DOC_BUILD_S3_URL = "https://ray-ci-results.s3.us-west-2.amazonaws.com/doc_build"


def find_latest_master_commit():
    """Find the latest commit pushed to origin/master that also exists locally."""
    latest_commits = (
        subprocess.check_output(
            [
                "git",
                "log",
                "-n",
                "100",
                "--format=%H",
            ]
        )
        .strip()
        .decode("utf-8")
        .split("\n")
    )
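    # A HEAD request checks whether <commit>.tgz exists in the public bucket
    # without downloading it; the first of the last 100 local commits with a
    # 200 response is the newest commit that has a prebuilt doc cache.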
    for commit in latest_commits:
        result = requests.head(f"{DOC_BUILD_S3_URL}/{commit}.tgz")
        if result.status_code == 200:
            return commit
    raise Exception(
        "No cache found for latest master commit. "
        "Please merge with upstream master or use 'make develop'."
    )


def fetch_cache_from_s3(commit, target_file_path):
    """
    Fetch the doc cache archive from the ray-ci-results S3 bucket.

    Args:
        commit: The commit hash of the doc cache to fetch.
        target_file_path: The file path to save the doc cache archive to.
    """
    # Create an S3 client
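    # The bucket is publicly readable, so the client is configured for
    # unsigned (anonymous) requests; no AWS credentials are required.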
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    s3_file_path = f"{DOC_BUILD_DIR_S3}/{commit}.tgz"
    try:
        print(f"Fetching doc cache from commit {commit}...")
        s3.download_file(S3_BUCKET, s3_file_path, target_file_path)
        print(f"Successfully downloaded {s3_file_path} to {target_file_path}")
    except botocore.exceptions.ClientError as e:
        print(f"Failed to download {s3_file_path} from S3: {str(e)}")
        raise


def extract_cache(cache_path: str, doc_dir: str):
    """
    Extract the doc cache archive, overwriting the ray/doc directory.

    Args:
        cache_path: The file path of the doc cache archive.
        doc_dir: The directory to extract the archive into.
    """
    with tarfile.open(cache_path, "r:gz") as tar:
        tar.extractall(doc_dir)
    print(f"Extracted {cache_path} to {doc_dir}")


def list_changed_and_added_files(ray_dir: str, latest_master_commit: str):
    """
    List all changed and added (untracked) files in the repo.

    This prevents the cache environment from updating the timestamps of these
    files.
    """
    untracked_files = (
        subprocess.check_output(
            ["git", "ls-files", "--others"],
            cwd=ray_dir,
        )
        .decode("utf-8")
        .splitlines()
    )
    modified_files = (
        subprocess.check_output(
            ["git", "ls-files", "--modified"],
            cwd=ray_dir,
        )
        .decode("utf-8")
        .splitlines()
    )
    diff_files_with_master = (
        subprocess.check_output(
            ["git", "diff", "--name-only", latest_master_commit],
            cwd=ray_dir,
        )
        .decode("utf-8")
        .splitlines()
    )
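    # Normalize paths relative to doc/source, e.g. "doc/source/data/api.rst"
    # becomes "data/api.rst", presumably so they match the paths tracked in
    # the Sphinx build environment.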
    filenames = []
    for filename in untracked_files + modified_files + diff_files_with_master:
        if filename.startswith("doc/"):  # Strip the "doc/" prefix
            filename = filename[len("doc/"):]
        if filename.startswith("source/"):  # Strip the "source/" prefix
            filename = filename[len("source/"):]
        filenames.append(filename)
    return filenames


def should_load_cache(ray_dir: str):
    """
    Check if the cache should be loaded, based on the timestamp of the last
    build.
    """
    ray_doc_dir = os.path.join(ray_dir, "doc")
    environment_pickle = os.path.join(ray_doc_dir, ENVIRONMENT_PICKLE)
    if not os.path.exists(environment_pickle):
        print("Doc build environment pickle file does not exist.")
        return True
    last_build_time = os.path.getmtime(environment_pickle)
    current_time = time.time()
    # Load the cache if the last build was more than LAST_BUILD_CUTOFF days ago
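    # (LAST_BUILD_CUTOFF * 60 * 60 * 24 converts days to seconds: with the
    # default of 3, any build older than 72 hours is considered outdated.)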
    print(f"Time since last build: {current_time - last_build_time:.0f} seconds")
    if current_time - last_build_time > LAST_BUILD_CUTOFF * 60 * 60 * 24:
        print(f"Last build was more than {LAST_BUILD_CUTOFF} days ago.")
        return True
    return False


@click.command()
@click.option("--ray-dir", default="/ray", help="Path to the Ray repo")
def main(ray_dir: str) -> None:
    if not should_load_cache(ray_dir):
        print("Skip loading global cache...")
        return
    print("Loading global cache...")
    latest_master_commit = find_latest_master_commit()
    # List all changed and added files in the repo, and save them to a file to
    # be used when updating the cache environment.
    filenames = list_changed_and_added_files(ray_dir, latest_master_commit)
    with open(f"{ray_dir}/{PENDING_FILES_PATH}", "w") as f:
        f.write("\n".join(filenames))
    cache_path = f"{ray_dir}/doc.tgz"
    # Fetch the cache of that commit from S3 to cache_path
    fetch_cache_from_s3(latest_master_commit, cache_path)
    # Extract the cache, overwriting the ray/doc directory
    extract_cache(cache_path, f"{ray_dir}/doc")
    os.remove(cache_path)


if __name__ == "__main__":
    main()
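
# Example invocation (the --ray-dir default assumes the repo checkout lives
# at /ray; pass your own path otherwise):
#   python load_doc_cache.py --ray-dir /path/to/ray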