build_collections.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. import os
  2. import sys
  3. from glob import glob
  4. from typing import List
  5. from xml.etree import ElementTree
  6. from nltk.downloader import _indent_xml
  7. if len(sys.argv) != 2:
  8. print("Usage: ")
  9. print("build_collections.py <path-to-packages>")
  10. sys.exit(-1)
  11. ROOT = sys.argv[1]
  12. def write(file_name: str, coll_name: str, items: List[str]) -> None:
  13. """Write `collection/{file_name}.xml` with `file_name` as the collection `id`,
  14. `coll_name` as the collection `name`, and `items` as a list of collection items.
  15. :param file_name: The id of the collection, equivalent to the file name,
  16. e.g. `all-corpora`.
  17. :type file_name: str
  18. :param coll_name: The name of the collection, e.g. `"All corpora"`
  19. :type coll_name: str
  20. :param items: A list of names for the collection items, e.g. `["abc", "alpino", ...]`
  21. :type items: List[str]
  22. """
  23. et = ElementTree.Element("collection", id=file_name, name=coll_name)
  24. et.extend(ElementTree.Element("item", ref=item) for item in sorted(items))
  25. _indent_xml(et)
  26. with open(os.path.join(ROOT, "collections", file_name + ".xml"), "w", encoding="utf8") as f:
  27. f.write(ElementTree.tostring(et).decode("utf8"))
  28. def get_id(xml_path: str) -> str:
  29. """Given a full path, extract only the filename (i.e. the nltk_data id)
  30. :param xml_path: A full path, e.g. "./packages/corpora/abc.xml"
  31. :type xml_path: str
  32. :return: The filename, without the extension, e.g. "abc"
  33. :rtype: str
  34. """
  35. return os.path.splitext(os.path.basename(xml_path))[0]
  36. # Write `collection/all-corpora.xml` based on all files under /packages/corpora
  37. corpora_items = [get_id(xml_path) for xml_path in glob(f"{ROOT}/packages/corpora/*.xml")]
  38. write("all-corpora", "All the corpora", corpora_items)
  39. # Write `collection/all-nltk.xml` and `collection/all.xml` based on all files under /packages
  40. all_items = [get_id(xml_path) for xml_path in glob(f"{ROOT}/packages/**/*.xml")]
  41. write("all-nltk", "All packages available on nltk_data gh-pages branch", all_items)
  42. write("all", "All packages", all_items)