py_dep_analysis.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. #!/usr/bin/env python
  2. #
  3. # This file contains utilities for understanding dependencies between python
  4. # source files and tests.
  5. #
  6. # Utils are assumed to be used from top level ray/ folder, since that is how
  7. # our tests are defined today.
  8. #
  9. # Example usage:
  10. # To find all circular dependencies under ray/python/:
  11. # python ci/pipeline/py_dep_analysis.py --mode=circular-dep
  12. # To find all the RLlib tests that depend on a file:
  13. # python ci/pipeline/py_dep_analysis.py --mode=test-dep \
  14. # --file=python/ray/tune/tune.py
  15. # For testing, add --smoke-test to any commands, so it doesn't spend
  16. # tons of time querying for available RLlib tests.
  17. import argparse
  18. import ast
  19. import os
  20. import re
  21. import subprocess
  22. import sys
  23. from typing import Dict, List, Tuple
  24. class DepGraph(object):
  25. def __init__(self):
  26. self.edges: Dict[str, Dict[str, bool]] = {}
  27. self.ids: Dict[str, int] = {}
  28. self.inv_ids: Dict[int, str] = {}
  29. def _run_shell(args: List[str]) -> str:
  30. return subprocess.check_output(args).decode(sys.stdout.encoding)
  31. def list_rllib_tests(n: int = -1, test: str = None) -> Tuple[str, List[str]]:
  32. """List RLlib tests.
  33. Args:
  34. n: return at most n tests. all tests if n = -1.
  35. test: only return information about a specific test.
  36. """
  37. tests_res = _run_shell(
  38. ["bazel", "query", "tests(//python/ray/rllib:*)", "--output", "label"]
  39. )
  40. all_tests = []
  41. # Strip, also skip any empty lines
  42. tests = [t.strip() for t in tests_res.splitlines() if t.strip()]
  43. for t in tests:
  44. if test and t != test:
  45. continue
  46. src_out = _run_shell(
  47. [
  48. "bazel",
  49. "query",
  50. 'kind("source file", deps({}))'.format(t),
  51. "--output",
  52. "label",
  53. ]
  54. )
  55. srcs = [f.strip() for f in src_out.splitlines()]
  56. srcs = [f for f in srcs if f.startswith("//python") and f.endswith(".py")]
  57. if srcs:
  58. all_tests.append((t, srcs))
  59. # Break early if smoke test.
  60. if n > 0 and len(all_tests) >= n:
  61. break
  62. return all_tests
  63. def _new_dep(graph: DepGraph, src_module: str, dep: str):
  64. """Create a new dependency between src_module and dep."""
  65. if dep not in graph.ids:
  66. graph.ids[dep] = len(graph.ids)
  67. src_id = graph.ids[src_module]
  68. dep_id = graph.ids[dep]
  69. if src_id not in graph.edges:
  70. graph.edges[src_id] = {}
  71. graph.edges[src_id][dep_id] = True
  72. def _new_import(graph: DepGraph, src_module: str, dep_module: str):
  73. """Process a new import statement in src_module."""
  74. # We don't care about system imports.
  75. if not dep_module.startswith("ray"):
  76. return
  77. _new_dep(graph, src_module, dep_module)
  78. def _is_path_module(module: str, name: str, _base_dir: str) -> bool:
  79. """Figure out if base.sub is a python module or not."""
  80. # Special handling for _raylet, which is a C++ lib.
  81. if module == "ray._raylet":
  82. return False
  83. bps = ["python"] + module.split(".")
  84. path = os.path.join(_base_dir, os.path.join(*bps), name + ".py")
  85. if os.path.isfile(path):
  86. return True # file module
  87. return False
  88. def _new_from_import(
  89. graph: DepGraph, src_module: str, dep_module: str, dep_name: str, _base_dir: str
  90. ):
  91. """Process a new "from ... import ..." statement in src_module."""
  92. # We don't care about imports outside of ray package.
  93. if not dep_module or not dep_module.startswith("ray"):
  94. return
  95. if _is_path_module(dep_module, dep_name, _base_dir):
  96. # dep_module.dep_name points to a file.
  97. _new_dep(graph, src_module, _full_module_path(dep_module, dep_name))
  98. else:
  99. # sub is an obj on base dir/file.
  100. _new_dep(graph, src_module, dep_module)
  101. def _process_file(graph: DepGraph, src_path: str, src_module: str, _base_dir=""):
  102. """Create dependencies from src_module to all the valid imports in src_path.
  103. Args:
  104. graph: the DepGraph to be added to.
  105. src_path: .py file to be processed.
  106. src_module: full module path of the source file.
  107. _base_dir: use a different base dir than current dir. For unit testing.
  108. """
  109. with open(os.path.join(_base_dir, src_path), "r") as in_f:
  110. tree = ast.parse(in_f.read())
  111. for node in ast.walk(tree):
  112. if isinstance(node, ast.Import):
  113. for alias in node.names:
  114. _new_import(graph, src_module, alias.name)
  115. elif isinstance(node, ast.ImportFrom):
  116. for alias in node.names:
  117. _new_from_import(
  118. graph, src_module, node.module, alias.name, _base_dir
  119. )
  120. def build_dep_graph() -> DepGraph:
  121. """Build index from py files to their immediate dependees."""
  122. graph = DepGraph()
  123. # Assuming we run from root /ray directory.
  124. # Follow links since rllib is linked to /rllib.
  125. for root, sub_dirs, files in os.walk("python", followlinks=True):
  126. if _should_skip(root):
  127. continue
  128. module = _bazel_path_to_module_path(root)
  129. # Process files first.
  130. for f in files:
  131. if not f.endswith(".py"):
  132. continue
  133. full = _full_module_path(module, f)
  134. if full not in graph.ids:
  135. graph.ids[full] = len(graph.ids)
  136. # Process file:
  137. _process_file(graph, os.path.join(root, f), full)
  138. # Build reverse index for convenience.
  139. graph.inv_ids = {v: k for k, v in graph.ids.items()}
  140. return graph
  141. def _full_module_path(module, f) -> str:
  142. if f == "__init__.py":
  143. # __init__ file for this module.
  144. # Full path is the same as the module name.
  145. return module
  146. fn = re.sub(r"\.py$", "", f)
  147. if not module:
  148. return fn
  149. return module + "." + fn
  150. def _should_skip(d: str) -> bool:
  151. """Skip directories that should not contain py sources."""
  152. if d.startswith("python/.eggs/"):
  153. return True
  154. if d.startswith("python/."):
  155. return True
  156. if d.startswith("python/build"):
  157. return True
  158. if d.startswith("python/ray/cpp"):
  159. return True
  160. return False
  161. def _bazel_path_to_module_path(d: str) -> str:
  162. """Convert a Bazel file path to python module path.
  163. Example: //python/ray/rllib:xxx/yyy/dd -> ray.rllib.xxx.yyy.dd
  164. """
  165. # Do this in 3 steps, so all of 'python:', 'python/', or '//python', etc
  166. # will get stripped.
  167. d = re.sub(r"^\/\/", "", d)
  168. d = re.sub(r"^python", "", d)
  169. d = re.sub(r"^[\/:]", "", d)
  170. return d.replace("/", ".").replace(":", ".")
  171. def _file_path_to_module_path(f: str) -> str:
  172. """Return the corresponding module path for a .py file."""
  173. dir, fn = os.path.split(f)
  174. return _full_module_path(_bazel_path_to_module_path(dir), fn)
  175. def _depends(
  176. graph: DepGraph, visited: Dict[int, bool], tid: int, qid: int
  177. ) -> List[int]:
  178. """Whether there is a dependency path from module tid to module qid.
  179. Given graph, and without going through visited.
  180. """
  181. if tid not in graph.edges or qid not in graph.edges:
  182. return []
  183. if qid in graph.edges[tid]:
  184. # tid directly depends on qid.
  185. return [tid, qid]
  186. for c in graph.edges[tid]:
  187. if c in visited:
  188. continue
  189. visited[c] = True
  190. # Reduce to a question of whether there is a path from c to qid.
  191. ds = _depends(graph, visited, c, qid)
  192. if ds:
  193. # From tid -> c -> qid.
  194. return [tid] + ds
  195. return []
  196. def test_depends_on_file(
  197. graph: DepGraph, test: Tuple[str, Tuple[str]], path: str
  198. ) -> List[int]:
  199. """Give dependency graph, check if a test depends on a specific .py file.
  200. Args:
  201. graph: the dependency graph.
  202. test: information about a test, in the format of:
  203. [test_name, (src files for the test)]
  204. """
  205. query = _file_path_to_module_path(path)
  206. if query not in graph.ids:
  207. # Not a file that we care about.
  208. return []
  209. t, srcs = test
  210. # Skip tuned_examples/ and examples/ tests.
  211. if t.startswith("//python/ray/rllib:examples/"):
  212. return []
  213. for src in srcs:
  214. if src == "ray.rllib.tests.run_regression_tests":
  215. return []
  216. tid = _file_path_to_module_path(src)
  217. if tid not in graph.ids:
  218. # Not a test that we care about.
  219. # TODO(jungong): What tests are these?????
  220. continue
  221. branch = _depends(graph, {}, graph.ids[tid], graph.ids[query])
  222. if branch:
  223. return branch
  224. # Does not depend on file.
  225. return []
  226. def _find_circular_dep_impl(graph: DepGraph, id: str, branch: str) -> bool:
  227. if id not in graph.edges:
  228. return False
  229. for c in graph.edges[id]:
  230. if c in branch:
  231. # Found a circle.
  232. branch.append(c)
  233. return True
  234. branch.append(c)
  235. if _find_circular_dep_impl(graph, c, branch):
  236. return True
  237. branch.pop()
  238. return False
  239. def find_circular_dep(graph: DepGraph) -> Dict[str, List[int]]:
  240. """Find circular dependencies among a dependency graph."""
  241. known = {}
  242. circles = {}
  243. for m, id in graph.ids.items():
  244. branch = []
  245. if _find_circular_dep_impl(graph, id, branch):
  246. if branch[-1] in known:
  247. # Already knew, skip.
  248. continue
  249. # Since this is a cycle dependency, any step along the circle
  250. # will form a different circle.
  251. # So we mark every entry on this circle known.
  252. for n in branch:
  253. known[n] = True
  254. # Mark that module m contains a potential circular dep.
  255. circles[m] = branch
  256. return circles
  257. if __name__ == "__main__":
  258. parser = argparse.ArgumentParser()
  259. parser.add_argument(
  260. "--mode",
  261. type=str,
  262. default="test-dep",
  263. help=(
  264. "test-dep: find dependencies for a specified test. "
  265. "circular-dep: find circular dependencies in "
  266. "the specific codebase."
  267. ),
  268. )
  269. parser.add_argument(
  270. "--file", type=str, help="Path of a .py source file relative to --base_dir."
  271. )
  272. parser.add_argument("--test", type=str, help="Specific test to check.")
  273. parser.add_argument(
  274. "--smoke-test", action="store_true", help="Load only a few tests for testing."
  275. )
  276. args = parser.parse_args()
  277. print("building dep graph ...")
  278. graph = build_dep_graph()
  279. print(
  280. "done. total {} files, {} of which have dependencies.".format(
  281. len(graph.ids), len(graph.edges)
  282. )
  283. )
  284. if args.mode == "circular-dep":
  285. circles = find_circular_dep(graph)
  286. print("Found following circular dependencies: \n")
  287. for m, b in circles.items():
  288. print(m)
  289. for n in b:
  290. print(" ", graph.inv_ids[n])
  291. print()
  292. if args.mode == "test-dep":
  293. assert args.file, "Must specify --file for the query."
  294. # Only support RLlib tests for now.
  295. # The way Tune tests are defined, they all depend on
  296. # the entire tune codebase.
  297. tests = list_rllib_tests(5 if args.smoke_test else -1, args.test)
  298. print("Total # of tests: ", len(tests))
  299. for t in tests:
  300. branch = test_depends_on_file(graph, t, args.file)
  301. if branch:
  302. print("{} depends on {}".format(t[0], args.file))
  303. # Print some debugging info.
  304. for n in branch:
  305. print(" ", graph.inv_ids[n])
  306. else:
  307. print("{} does not depend on {}".format(t[0], args.file))