tokenizer.py

# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: Tokenizer
"""
import os
import re

import jieba
from jieba import posseg

from pycorrector.utils.text_utils import is_chinese_string

jieba.setLogLevel(log_level="ERROR")


def split_text_into_sentences_by_symbol(text, include_symbol=True):
    """
    Split text into sentences, using punctuation as the separator
    :param text: str
    :param include_symbol: bool, whether to keep the punctuation segments in the result
    :return: list of (sentence, idx)
    """
    # \u4E00-\u9FA5a-zA-Z0-9+#& : non-space characters, handled by re_han
    # \r\n|\s : whitespace characters, skipped
    re_han = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&]+)", re.U)
    re_skip = re.compile(r"(\r\n|\s)", re.U)
    result = []
    sentences = re_han.split(text)
    start_idx = 0
    for sentence in sentences:
        if not sentence:
            continue
        if include_symbol:
            result.append((sentence, start_idx))
        else:
            if re_han.match(sentence):
                result.append((sentence, start_idx))
        start_idx += len(sentence)
    return result
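
# Illustrative example (not from the original file): with include_symbol=True the punctuation
# segments are kept along with their character offsets, e.g.
#   split_text_into_sentences_by_symbol("你好,世界。")
#   -> [('你好', 0), (',', 2), ('世界', 3), ('。', 5)]
# with include_symbol=False only the han/alphanumeric runs remain: [('你好', 0), ('世界', 3)]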


def split_text_into_sentences_by_length(text, length=512):
    """
    Split text into fixed-length chunks
    :param text: str
    :param length: int, maximum length of each chunk
    :return: list of (sentence, idx)
    """
    result = []
    for i in range(0, len(text), length):
        result.append((text[i:i + length], i))
    return result
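
# Illustrative example (not from the original file): each chunk carries its start offset, e.g.
#   split_text_into_sentences_by_length("abcdefgh", 3) -> [('abc', 0), ('def', 3), ('gh', 6)]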


def tokenize_words(text):
    """Word segmentation: jieba for Chinese segments, whitespace splitting otherwise."""
    output = []
    sentences = split_text_into_sentences_by_symbol(text, include_symbol=True)
    for sentence, idx in sentences:
        if is_chinese_string(sentence):
            output.extend(jieba.lcut(sentence))
        else:
            output.extend(whitespace_tokenize(sentence))
    return output


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    tokens = []
    if not text:
        return tokens
    sents = split_text_into_sentences_by_symbol(text, include_symbol=True)
    for sent, idx in sents:
        tokens.extend(sent.split())
    return tokens


class FullTokenizer(object):
    """Runs full tokenization."""

    def __init__(self, lower=True):
        self.lower = lower

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        res = []
        if len(text) == 0:
            return res
        if self.lower:
            text = text.lower()
        # handles both multilingual and Chinese text
        res = tokenize_words(text)
        return res


def segment(sentence, cut_type='word', pos=False):
    """
    Word segmentation
    :param sentence: str
    :param cut_type: 'word' uses jieba.lcut; 'char' uses list(sentence)
    :param pos: bool, whether to also return POS tags
    :return: list of tokens, or (word_seq, pos_seq) when pos=True
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)


class Tokenizer(object):
    def __init__(self, dict_path='', custom_word_freq_dict=None, custom_confusion_dict=None):
        self.model = jieba
        # Initialize the main dictionary
        if os.path.exists(dict_path):
            self.model.set_dictionary(dict_path)
        # Load the user-defined word-frequency dictionary
        if custom_word_freq_dict:
            for w, f in custom_word_freq_dict.items():
                self.model.add_word(w, freq=f)
        # Load the confusion-set dictionary
        if custom_confusion_dict:
            for k, word in custom_confusion_dict.items():
                # add both sides of the confusion pair to the tokenizer's custom dictionary
                self.model.add_word(k)
                self.model.add_word(word)

    def tokenize(self, unicode_sentence, mode="search"):
        """
        Tokenize and return token positions; search mode is used to widen recall for error correction
        :param unicode_sentence: query
        :param mode: search, default, ngram
        :return: list of (w, start, start + width)
        """
        if mode == 'ngram':
            n = 2
            result_set = set()
            tokens = self.model.lcut(unicode_sentence)
            tokens_len = len(tokens)
            start = 0
            for i in range(0, tokens_len):
                w = tokens[i]
                width = len(w)
                result_set.add((w, start, start + width))
                # also collect the n-grams (up to n tokens) starting at token i
                for j in range(i, i + n):
                    gram = "".join(tokens[i:j + 1])
                    gram_width = len(gram)
                    if i + j > tokens_len:
                        break
                    result_set.add((gram, start, start + gram_width))
                start += width
            results = list(result_set)
            result = sorted(results, key=lambda x: x[-1])
        else:
            result = list(self.model.tokenize(unicode_sentence, mode=mode))
        return result


if __name__ == '__main__':
    text = "这个消息在北京城里不胫儿走,你好,我才来到这里。你呢?"
    print(text)
    t = Tokenizer()
    print('default', t.tokenize(text, 'default'))
    print('search', t.tokenize(text, 'search'))
    print('ngram', t.tokenize(text, 'ngram'))
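
    # Illustrative (not in the original demo): segment() on the same sentence;
    # the exact tokens depend on the jieba dictionary in use.
    print('segment word:', segment(text, cut_type='word'))
    print('segment char+pos:', segment(text, cut_type='char', pos=True))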

    paragraph = "The first time I heard that song was in Hawaii on radio. " \
                "I was just a kid, and loved it very much! What a fantastic song!"
    cutwords1 = whitespace_tokenize(paragraph)  # whitespace tokenization
    print('whitespace_tokenize:', cutwords1)

    print('----\n', text)
    r = split_text_into_sentences_by_symbol(text, include_symbol=True)
    print('split_text_into_sentences_by_symbol:', r)
    r = split_text_into_sentences_by_length(text, 4)
    print('split_text_into_sentences_by_length:', r)