check.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. #!/usr/bin/env python3
  2. ############################################################################
  3. #
  4. # Licensed to the Apache Software Foundation (ASF) under one or more
  5. # contributor license agreements. See the NOTICE file distributed with
  6. # this work for additional information regarding copyright ownership. The
  7. # ASF licenses this file to you under the Apache License, Version 2.0 (the
  8. # "License"); you may not use this file except in compliance with the
  9. # License. You may obtain a copy of the License at
  10. #
  11. # http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16. # License for the specific language governing permissions and limitations
  17. # under the License.
  18. #
  19. ############################################################################
  20. import getopt
  21. import json
  22. import os
  23. import re
  24. import subprocess
  25. import sys
  26. import termcolor
  27. committers_json = None
  28. non_commiters_json = None
  29. author_mappings_json = None
  30. verbose_level = 0
  31. color = True
  32. def colored(s, c):
  33. if color:
  34. return termcolor.colored(s, c)
  35. else:
  36. return s
  37. def commit_attributions(c):
  38. regex = re.compile("(?i)(?:by|from|author|Co-authored-by):? +(.+)")
  39. return re.findall(regex, c["message"]) + re.findall(regex, c["body"])
  40. def get_headers(s):
  41. return re.findall("(?i)/\*\*\*.+?(?:Copyright).+?\*\*\*+/", s, re.DOTALL)
  42. def get_file(blob):
  43. try:
  44. return subprocess.check_output(
  45. ["git", "cat-file", "-p", blob], stderr=subprocess.DEVNULL
  46. ).decode()
  47. except subprocess.CalledProcessError:
  48. return None
  49. def header_authors(header):
  50. results = re.findall("[Aa]uthors?: +(.+?) *(?:Redistribution)", header, re.DOTALL)
  51. results = [re.split("\n[ *]+", result) for result in results]
  52. results = sum(results, []) # flatten
  53. results = [
  54. re.sub("[Cc]opyright:?( ?.[Cc].)? *([12][0-9]{3}[,-]? ?)", "", result)
  55. for result in results
  56. ]
  57. results = list(filter(lambda s: s != "", results)) # remove empty strings
  58. return results
  59. # Search for an author name in Apache's committers/non-committers
  60. # database. It will return (apacheID,name) if there's a match or
  61. # None if not. apacheID might be None if there's no Apache ID
  62. # for author
  63. def search_for_cla(name):
  64. for k, v in committers_json["committers"].items():
  65. if v == name:
  66. return (k, v)
  67. if name in non_committers_json["non_committers"]:
  68. return (None, name)
  69. return None
  70. # Returns the same as above, but this takes an author
  71. # (which may include an email include an email used
  72. # to look for alternative author names for this person)
  73. def author_has_cla(author):
  74. if "@" in author:
  75. matches = re.match("^(.+?)(?: +([^ ]+@[^ ]+ *))$", author)
  76. if not matches:
  77. return None # found an '@' but it wasn't an email, so this is most likely not really an author
  78. name = matches.group(1)
  79. email = matches.group(2).lstrip("<").rstrip(">")
  80. else:
  81. name = author.strip()
  82. email = None
  83. vvvprint("name: %s email: %s" % (name, email if email else "?"))
  84. # first look for name directly
  85. result = search_for_cla(name)
  86. if result:
  87. return result
  88. # otherwise, get all available alternative names for author
  89. # and look for each
  90. if email and (email in author_mappings_json):
  91. result = search_for_cla(author_mappings_json[email])
  92. if result:
  93. return result
  94. # Nothing matched
  95. return None
  96. def header_copyrights(header):
  97. results = re.findall(
  98. " \* *[Cc]opyright:?(?: ?.[Cc].)? *(?:[12][0-9]{3}[,-]? ?)* *(.+)", header
  99. )
  100. return [re.sub("(. )?[Aa]ll rights reserved.?", "", result) for result in results]
  101. def report_cla(author):
  102. cla = author_has_cla(author)
  103. if cla:
  104. (apacheid, name) = cla
  105. print(colored("✓", "green"), end=" ")
  106. else:
  107. apacheid = None
  108. print(colored("✗", "red"), end=" ")
  109. if apacheid:
  110. print("%s (ID: %s)" % (author, apacheid))
  111. else:
  112. print(author)
  113. def analyze(j):
  114. complete_attributions = set()
  115. complete_authors = set()
  116. complete_copyrights = set()
  117. vprint("file has %i commits" % len(j))
  118. for commit in j:
  119. authors = set()
  120. vprint(colored("-", "yellow"))
  121. vprint(colored("commit: ", "green") + commit["commit"])
  122. vprint(colored("blob: ", "green") + commit["blob"])
  123. vprint(colored("date: ", "green") + commit["date"])
  124. vprint(
  125. colored("author: ", "green")
  126. + ("%s <%s>" % (commit["author"], commit["author-email"]))
  127. )
  128. attributions = commit_attributions(commit)
  129. if len(attributions) > 0:
  130. vprint(colored("attributions:", "green"))
  131. for attribution in attributions:
  132. vprint(attribution)
  133. complete_attributions |= set(attributions)
  134. complete_authors |= set([commit["author"] + " " + commit["author-email"]])
  135. # skip deletion commits
  136. vprint(colored("blob:", "green"), end=" ")
  137. if commit["blob"] == "0000000000000000000000000000000000000000":
  138. vprint("zero (deletion)")
  139. continue
  140. file_contents = get_file(commit["blob"])
  141. # skip inaccessible blobs (probably lived in a submodule)
  142. if not file_contents:
  143. vprint("inaccessible")
  144. continue
  145. else:
  146. vprint("available")
  147. headers = get_headers(file_contents)
  148. vprint(colored("header authors:", "green"))
  149. for header in headers:
  150. ha = header_authors(header)
  151. authors |= set(ha)
  152. vprint(ha)
  153. complete_authors |= set(authors)
  154. vprint(colored("header copyrights:", "green"))
  155. copyrights = set()
  156. for header in headers:
  157. hc = header_copyrights(header)
  158. copyrights |= set(hc)
  159. vprint(hc)
  160. vprint(colored("commit description:", "green"))
  161. vprint(commit["message"])
  162. if commit["body"]:
  163. vprint(colored("commit msg body:", "green"))
  164. vprint(commit["body"])
  165. vvprint(colored("headers:", "green"))
  166. for header in headers:
  167. vvprint(header)
  168. complete_copyrights |= copyrights
  169. vprint(colored("----\n", "yellow"))
  170. print(colored("COMPLETE REPORT:", "blue"))
  171. print(colored("attributions:", "green"))
  172. if len(complete_attributions) == 0:
  173. print("*none detected*")
  174. else:
  175. for attribution in complete_attributions:
  176. report_cla(attribution)
  177. print(colored("authors:", "green"))
  178. for author in complete_authors:
  179. report_cla(author)
  180. print(colored("copyrights:", "green"))
  181. print("\n".join(complete_copyrights))
  182. def print_help():
  183. print("Usage: check.py [-v] [-n] <JSON file>\n")
  184. print(
  185. " -v\tIncrease verbosity (add up to three times)\n"
  186. " -n\tDo not use color for output"
  187. )
  188. def vprint(*args, **kwargs):
  189. if verbose_level > 0:
  190. print(*args, **kwargs)
  191. def vvprint(*args, **kwargs):
  192. if verbose_level > 1:
  193. print(*args, **kwargs)
  194. def vvvprint(*args, **kwargs):
  195. if verbose_level > 2:
  196. print(*args, **kwargs)
  197. #####
  198. # First try to load the CLAs JSONs:
  199. try:
  200. with open(
  201. os.path.dirname(os.path.abspath(__file__)) + "/icla-info.json", "r"
  202. ) as file:
  203. committers_json = json.load(file)
  204. with open(
  205. os.path.dirname(os.path.abspath(__file__)) + "/icla-info_noid.json", "r"
  206. ) as file:
  207. non_committers_json = json.load(file)
  208. except Exception:
  209. print(
  210. "Could not open CLA JSON files, please read README.md for download instructions"
  211. )
  212. sys.exit(2)
  213. # Open author mappings JSON
  214. with open(
  215. os.path.dirname(os.path.abspath(__file__)) + "/author_mappings.json", "r"
  216. ) as file:
  217. author_mappings_json = json.load(file)
  218. try:
  219. opts, args = getopt.getopt(sys.argv[1:], "hnv")
  220. except getopt.GetoptError:
  221. print_help()
  222. sys.exit(2)
  223. for opt, arg in opts:
  224. if opt == "-h":
  225. print_help()
  226. sys.exit()
  227. elif opt == "-v":
  228. verbose_level = verbose_level + 1
  229. elif opt == "-n":
  230. color = False
  231. if len(args) != 1:
  232. print_help()
  233. sys.exit(2)
  234. f = args[0]
  235. if not f:
  236. print_help()
  237. sys.exit(2)
  238. if f == "-":
  239. j = json.load(sys.stdin)
  240. else:
  241. with open(f, "r") as file:
  242. j = json.load(file)
  243. analyze(j)