# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: Tokenizer
"""
import os
import re

import jieba
from jieba import posseg

from pycorrector.utils.text_utils import is_chinese_string

jieba.setLogLevel(log_level="ERROR")


def split_text_into_sentences_by_symbol(text, include_symbol=True):
    """
    Split text into short sentences, using punctuation symbols as separators.
    :param text: str
    :param include_symbol: bool, whether to keep the separator symbols in the result
    :return: list of (sentence, idx) tuples, idx is the start offset in text
    """
    # \u4E00-\u9FA5a-zA-Z0-9+#& : characters kept as sentence content (handled by re_han);
    # everything else (punctuation, \r\n, whitespace, ...) acts as a separator.
    re_han = re.compile("([\u4E00-\u9Fa5a-zA-Z0-9+#&]+)", re.U)
    result = []
    sentences = re_han.split(text)
    start_idx = 0
    for sentence in sentences:
        if not sentence:
            continue
        if include_symbol:
            result.append((sentence, start_idx))
        else:
            if re_han.match(sentence):
                result.append((sentence, start_idx))
        start_idx += len(sentence)
    return result
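

# Usage sketch (illustrative, not part of the original module); offsets computed by hand:
#   split_text_into_sentences_by_symbol("你好,世界。", include_symbol=True)
#   -> [('你好', 0), (',', 2), ('世界', 3), ('。', 5)]
#   split_text_into_sentences_by_symbol("你好,世界。", include_symbol=False)
#   -> [('你好', 0), ('世界', 3)]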


def split_text_into_sentences_by_length(text, length=512):
    """
    Split text into fixed-length chunks.
    :param text: str
    :param length: int, maximum length of each chunk
    :return: list of (sentence, idx) tuples, idx is the start offset in text
    """
    result = []
    for i in range(0, len(text), length):
        result.append((text[i:i + length], i))
    return result
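

# Usage sketch (illustrative): chunks of at most `length` characters with their start offsets:
#   split_text_into_sentences_by_length("abcdefgh", 3) -> [('abc', 0), ('def', 3), ('gh', 6)]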


def tokenize_words(text):
    """Word segmentation: jieba for Chinese-only spans, whitespace splitting otherwise."""
    output = []
    sentences = split_text_into_sentences_by_symbol(text, include_symbol=True)
    for sentence, _ in sentences:
        if is_chinese_string(sentence):
            output.extend(jieba.lcut(sentence))
        else:
            output.extend(whitespace_tokenize(sentence))
    return output
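

# Usage sketch (illustrative; the Chinese tokens depend on jieba's dictionary):
#   tokenize_words("你好,this is a test")
#   -> likely ['你好', ',', 'this', 'is', 'a', 'test']
#   (the Chinese-only block goes through jieba, the rest is whitespace/symbol split)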


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    tokens = []
    if not text:
        return tokens
    sents = split_text_into_sentences_by_symbol(text, include_symbol=True)
    for sent, _ in sents:
        tokens.extend(sent.split())
    return tokens
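

# Usage sketch (illustrative): punctuation becomes its own token because of the symbol split:
#   whitespace_tokenize("hello world, how are you")
#   -> ['hello', 'world', ',', 'how', 'are', 'you']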


class FullTokenizer(object):
    """Runs full tokenization on a piece of text."""

    def __init__(self, lower=True):
        self.lower = lower

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        res = []
        if len(text) == 0:
            return res
        if self.lower:
            text = text.lower()
        # handles both multilingual and Chinese text
        res = tokenize_words(text)
        return res
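

# Usage sketch (illustrative; Chinese tokens depend on jieba's dictionary):
#   FullTokenizer(lower=True).tokenize("Hello 你好")
#   -> lowercases first, then likely ['hello', '你好']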


def segment(sentence, cut_type='word', pos=False):
    """
    Segment a sentence into words or characters.
    :param sentence: str
    :param cut_type: 'word' uses jieba.lcut; 'char' uses list(sentence)
    :param pos: bool, also return POS tags
    :return: list of tokens, or (word_seq, pos_seq) when pos=True
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                # posseg can return an empty list for whitespace; fall back to 'x' (other)
                pos_seq.append(w_p[0].flag if w_p else 'x')
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
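

# Usage sketch (illustrative; POS flags and word-level tokens depend on jieba):
#   segment("你好", cut_type='char')           -> ['你', '好']
#   segment("你好", cut_type='char', pos=True) -> (['你', '好'], [<posseg flags>])
#   segment("你好", cut_type='word')           -> jieba word segmentation of the sentence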


class Tokenizer(object):
    def __init__(self, dict_path='', custom_word_freq_dict=None, custom_confusion_dict=None):
        self.model = jieba
        # load the main dictionary
        if os.path.exists(dict_path):
            self.model.set_dictionary(dict_path)
        # load the user-defined word-frequency dictionary
        if custom_word_freq_dict:
            for w, f in custom_word_freq_dict.items():
                self.model.add_word(w, freq=f)
        # load the confusion-set dictionary
        if custom_confusion_dict:
            for k, word in custom_confusion_dict.items():
                # add both entries to the tokenizer's custom dictionary
                self.model.add_word(k)
                self.model.add_word(word)

    def tokenize(self, unicode_sentence, mode="search", HMM=True):
        """
        Tokenize and return token positions; search mode widens recall for error correction.
        :param unicode_sentence: str, the query text
        :param mode: 'search', 'default' or 'ngram'
        :param HMM: bool, enable HMM for new-word discovery
        :return: list of (word, start, end) tuples
        """
        if mode == 'ngram':
            n = 2
            result_set = set()
            tokens = self.model.lcut(unicode_sentence, HMM=HMM)
            tokens_len = len(tokens)
            start = 0
            for i in range(tokens_len):
                w = tokens[i]
                width = len(w)
                result_set.add((w, start, start + width))
                # also add the n-grams (bigrams here) starting at this token
                for j in range(i, i + n):
                    if j >= tokens_len:
                        break
                    gram = "".join(tokens[i:j + 1])
                    gram_width = len(gram)
                    result_set.add((gram, start, start + gram_width))
                start += width
            results = list(result_set)
            result = sorted(results, key=lambda x: x[-1])
        else:
            result = list(self.model.tokenize(unicode_sentence, mode=mode, HMM=HMM))
        return result


if __name__ == '__main__':
    text = "这个消息在北京城里不胫儿走,你好,我才来到这里。你呢?"
    print(text)
    t = Tokenizer()
    print('default', t.tokenize(text, 'default'))
    print('search', t.tokenize(text, 'search'))
    print('ngram', t.tokenize(text, 'ngram'))

    paragraph = "The first time I heard that song was in Hawaii on radio. " \
                "I was just a kid, and loved it very much! What a fantastic song!"
    cutwords1 = whitespace_tokenize(paragraph)  # whitespace word segmentation
    print('whitespace_tokenize result:', cutwords1)

    print('----\n', text)
    r = split_text_into_sentences_by_symbol(text, include_symbol=True)
    print('split_text_into_sentences_by_symbol:', r)
    r = split_text_into_sentences_by_length(text, 4)
    print('split_text_into_sentences_by_length:', r)
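
    # Additional demo calls (illustrative additions, not in the original script);
    # word-level outputs depend on jieba's dictionary.
    print('segment word:', segment(text, cut_type='word'))
    print('segment char:', segment(text, cut_type='char'))
    print('tokenize_words:', tokenize_words(text))
    print('FullTokenizer:', FullTokenizer(lower=True).tokenize(text))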