tokenizer.py

# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: Tokenizer
"""
import os
import re

import jieba
from jieba import posseg

from pycorrector.utils.text_utils import is_chinese_string

jieba.setLogLevel(log_level="ERROR")


def split_text_into_sentences_by_symbol(text, include_symbol=True):
    """
    Split text into sentences, using punctuation as the separator
    :param text: str
    :param include_symbol: bool, whether to keep the punctuation segments in the result
    :return: list of (sentence, idx)
    """
    # \u4E00-\u9FA5a-zA-Z0-9+#& : non-space characters, handled by re_han
    # \r\n|\s : whitespace characters, skipped
    re_han = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&]+)", re.U)
    re_skip = re.compile(r"(\r\n|\s)", re.U)
    result = []
    sentences = re_han.split(text)
    start_idx = 0
    for sentence in sentences:
        if not sentence:
            continue
        if include_symbol:
            result.append((sentence, start_idx))
        else:
            if re_han.match(sentence):
                result.append((sentence, start_idx))
        start_idx += len(sentence)
    return result
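
# Illustrative example (not from the original file): with include_symbol=True the punctuation
# segments are kept along with their character offsets, e.g.
#   split_text_into_sentences_by_symbol("你好,世界。")
#   -> [('你好', 0), (',', 2), ('世界', 3), ('。', 5)]
# with include_symbol=False only the han/alphanumeric runs remain: [('你好', 0), ('世界', 3)]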


def split_text_into_sentences_by_length(text, length=512):
    """
    Split text into fixed-length chunks
    :param text: str
    :param length: int, maximum length of each chunk
    :return: list of (sentence, idx)
    """
    result = []
    for i in range(0, len(text), length):
        result.append((text[i:i + length], i))
    return result
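
# Illustrative example (not from the original file): each chunk carries its start offset, e.g.
#   split_text_into_sentences_by_length("abcdefgh", 3) -> [('abc', 0), ('def', 3), ('gh', 6)]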


def tokenize_words(text):
    """Word segmentation: jieba for Chinese segments, whitespace splitting otherwise."""
    output = []
    sentences = split_text_into_sentences_by_symbol(text, include_symbol=True)
    for sentence, idx in sentences:
        if is_chinese_string(sentence):
            output.extend(jieba.lcut(sentence))
        else:
            output.extend(whitespace_tokenize(sentence))
    return output


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    tokens = []
    if not text:
        return tokens
    sents = split_text_into_sentences_by_symbol(text, include_symbol=True)
    for sent, idx in sents:
        tokens.extend(sent.split())
    return tokens


class FullTokenizer(object):
    """Runs full tokenization."""

    def __init__(self, lower=True):
        self.lower = lower

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        res = []
        if len(text) == 0:
            return res
        if self.lower:
            text = text.lower()
        # handles both multilingual and Chinese text
        res = tokenize_words(text)
        return res


def segment(sentence, cut_type='word', pos=False):
    """
    Word segmentation
    :param sentence: str
    :param cut_type: 'word' uses jieba.lcut; 'char' uses list(sentence)
    :param pos: bool, whether to also return POS tags
    :return: list of tokens, or (word_seq, pos_seq) when pos=True
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)


class Tokenizer(object):
    def __init__(self, dict_path='', custom_word_freq_dict=None, custom_confusion_dict=None):
        self.model = jieba
        # Initialize the main dictionary
        if os.path.exists(dict_path):
            self.model.set_dictionary(dict_path)
        # Load the user-defined word-frequency dictionary
        if custom_word_freq_dict:
            for w, f in custom_word_freq_dict.items():
                self.model.add_word(w, freq=f)
        # Load the confusion-set dictionary
        if custom_confusion_dict:
            for k, word in custom_confusion_dict.items():
                # add both sides of the confusion pair to the tokenizer's custom dictionary
                self.model.add_word(k)
                self.model.add_word(word)

    def tokenize(self, unicode_sentence, mode="search"):
        """
        Tokenize and return token positions; search mode is used to widen recall for error correction
        :param unicode_sentence: query
        :param mode: search, default, ngram
        :return: list of (w, start, start + width)
        """
        if mode == 'ngram':
            n = 2
            result_set = set()
            tokens = self.model.lcut(unicode_sentence)
            tokens_len = len(tokens)
            start = 0
            for i in range(0, tokens_len):
                w = tokens[i]
                width = len(w)
                result_set.add((w, start, start + width))
                # also collect the n-grams (up to n tokens) starting at token i
                for j in range(i, i + n):
                    gram = "".join(tokens[i:j + 1])
                    gram_width = len(gram)
                    if i + j > tokens_len:
                        break
                    result_set.add((gram, start, start + gram_width))
                start += width
            results = list(result_set)
            result = sorted(results, key=lambda x: x[-1])
        else:
            result = list(self.model.tokenize(unicode_sentence, mode=mode))
        return result


if __name__ == '__main__':
    text = "这个消息在北京城里不胫儿走,你好,我才来到这里。你呢?"
    print(text)
    t = Tokenizer()
    print('default', t.tokenize(text, 'default'))
    print('search', t.tokenize(text, 'search'))
    print('ngram', t.tokenize(text, 'ngram'))
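
    # Illustrative (not in the original demo): segment() on the same sentence;
    # the exact tokens depend on the jieba dictionary in use.
    print('segment word:', segment(text, cut_type='word'))
    print('segment char+pos:', segment(text, cut_type='char', pos=True))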

    paragraph = "The first time I heard that song was in Hawaii on radio. " \
                "I was just a kid, and loved it very much! What a fantastic song!"
    cutwords1 = whitespace_tokenize(paragraph)  # whitespace tokenization
    print('whitespace_tokenize:', cutwords1)

    print('----\n', text)
    r = split_text_into_sentences_by_symbol(text, include_symbol=True)
    print('split_text_into_sentences_by_symbol:', r)
    r = split_text_into_sentences_by_length(text, 4)
    print('split_text_into_sentences_by_length:', r)