ngram_util.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author:XuMing(xuming624@qq.com)
  4. @description:
  5. """
  6. class NgramUtil:
  7. @staticmethod
  8. def unigrams(words):
  9. """
  10. Input: a list of words, e.g., ["I", "am", "Denny"]
  11. Output: a list of unigram
  12. """
  13. assert type(words) == list
  14. return words
  15. @staticmethod
  16. def bigrams(words, join_string, skip=0):
  17. """
  18. Input: a list of words, e.g., ["I", "am", "Denny"]
  19. Output: a list of bigram, e.g., ["I_am", "am_Denny"]
  20. """
  21. assert type(words) == list
  22. L = len(words)
  23. if L > 1:
  24. lst = []
  25. for i in range(L - 1):
  26. for k in range(1, skip + 2):
  27. if i + k < L:
  28. lst.append(join_string.join([words[i], words[i + k]]))
  29. else:
  30. # set it as unigram
  31. lst = NgramUtil.unigrams(words)
  32. return lst
  33. @staticmethod
  34. def trigrams(words, join_string, skip=0):
  35. """
  36. Input: a list of words, e.g., ["I", "am", "Denny"]
  37. Output: a list of trigram, e.g., ["I_am_Denny"]
  38. """
  39. assert type(words) == list
  40. L = len(words)
  41. if L > 2:
  42. lst = []
  43. for i in range(L - 2):
  44. for k1 in range(1, skip + 2):
  45. for k2 in range(1, skip + 2):
  46. if i + k1 < L and i + k1 + k2 < L:
  47. lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]]))
  48. else:
  49. # set it as bigram
  50. lst = NgramUtil.bigrams(words, join_string, skip)
  51. return lst
  52. @staticmethod
  53. def fourgrams(words, join_string):
  54. """
  55. Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
  56. Output: a list of trigram, e.g., ["I_am_Denny_boy"]
  57. """
  58. assert type(words) == list
  59. L = len(words)
  60. if L > 3:
  61. lst = []
  62. for i in range(L - 3):
  63. lst.append(join_string.join([words[i], words[i + 1], words[i + 2], words[i + 3]]))
  64. else:
  65. # set it as trigram
  66. lst = NgramUtil.trigrams(words, join_string)
  67. return lst
  68. @staticmethod
  69. def uniterms(words):
  70. return NgramUtil.unigrams(words)
  71. @staticmethod
  72. def biterms(words, join_string):
  73. """
  74. Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
  75. Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"]
  76. """
  77. assert type(words) == list
  78. L = len(words)
  79. if L > 1:
  80. lst = []
  81. for i in range(L - 1):
  82. for j in range(i + 1, L):
  83. lst.append(join_string.join([words[i], words[j]]))
  84. else:
  85. # set it as uniterm
  86. lst = NgramUtil.uniterms(words)
  87. return lst
  88. @staticmethod
  89. def triterms(words, join_string):
  90. """
  91. Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
  92. Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
  93. """
  94. assert type(words) == list
  95. L = len(words)
  96. if L > 2:
  97. lst = []
  98. for i in range(L - 2):
  99. for j in range(i + 1, L - 1):
  100. for k in range(j + 1, L):
  101. lst.append(join_string.join([words[i], words[j], words[k]]))
  102. else:
  103. # set it as biterm
  104. lst = NgramUtil.biterms(words, join_string)
  105. return lst
  106. @staticmethod
  107. def fourterms(words, join_string):
  108. """
  109. Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
  110. Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
  111. """
  112. assert type(words) == list
  113. L = len(words)
  114. if L > 3:
  115. lst = []
  116. for i in range(L - 3):
  117. for j in range(i + 1, L - 2):
  118. for k in range(j + 1, L - 1):
  119. for l in range(k + 1, L):
  120. lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
  121. else:
  122. # set it as triterm
  123. lst = NgramUtil.triterms(words, join_string)
  124. return lst
  125. @staticmethod
  126. def ngrams(words, ngram, join_string=" "):
  127. """
  128. wrapper for ngram
  129. """
  130. ngram = int(ngram)
  131. if ngram == 1:
  132. return NgramUtil.unigrams(words)
  133. elif ngram == 2:
  134. return NgramUtil.bigrams(words, join_string)
  135. elif ngram == 3:
  136. return NgramUtil.trigrams(words, join_string)
  137. elif ngram == 4:
  138. return NgramUtil.fourgrams(words, join_string)
  139. elif ngram == 12:
  140. unigram = NgramUtil.unigrams(words)
  141. bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
  142. return unigram + bigram
  143. elif ngram == 123:
  144. unigram = NgramUtil.unigrams(words)
  145. bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
  146. trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
  147. return unigram + bigram + trigram
  148. elif ngram == 1234:
  149. unigram = NgramUtil.unigrams(words)
  150. bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
  151. trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
  152. fourgram = [x for x in NgramUtil.fourgrams(words, join_string) if len(x.split(join_string)) == 4]
  153. return unigram + bigram + trigram + fourgram
  154. @staticmethod
  155. def nterms(words, nterm, join_string=" "):
  156. """wrapper for nterm"""
  157. if nterm == 1:
  158. return NgramUtil.uniterms(words)
  159. elif nterm == 2:
  160. return NgramUtil.biterms(words, join_string)
  161. elif nterm == 3:
  162. return NgramUtil.triterms(words, join_string)
  163. elif nterm == 4:
  164. return NgramUtil.fourterms(words, join_string)