text_utils.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author:XuMing(xuming624@qq.com)
  4. @description: 汉字处理的工具:判断unicode是否是汉字,数字,英文,或者其他字符。以及全角符号转半角符号。
  5. """
  6. import re
  7. import pypinyin
  8. from pypinyin import pinyin
  9. from pycorrector.utils.langconv import Converter
  10. def is_chinese_char(uchar):
  11. """判断一个unicode是否是汉字"""
  12. return '\u4e00' <= uchar <= '\u9fa5'
  13. def is_chinese_string(string):
  14. """判断是否全为汉字"""
  15. return all(is_chinese_char(c) for c in string)
  16. def is_number(uchar):
  17. """判断一个unicode是否是数字"""
  18. return '\u0030' <= uchar <= '\u0039'
  19. def is_alphabet(uchar):
  20. """判断一个unicode是否是英文字母"""
  21. return '\u0041' <= uchar <= '\u005a' or '\u0061' <= uchar <= '\u007a'
  22. def is_alphabet_string(string):
  23. """判断是否全部为英文字母"""
  24. return all(is_alphabet(c) for c in string)
  25. def is_alphabet_number_string(string):
  26. """判断全是数字和英文字符"""
  27. return all((is_alphabet(c) or is_number(c)) for c in string)
  28. def is_other(uchar):
  29. """判断是否非汉字,数字和英文字符"""
  30. return not (is_chinese_char(uchar) or is_number(uchar) or is_alphabet(uchar))
  31. def B2Q(uchar):
  32. """半角转全角"""
  33. inside_code = ord(uchar)
  34. if inside_code < 0x0020 or inside_code > 0x7e: # 不是半角字符就返回原来的字符
  35. return uchar
  36. if inside_code == 0x0020: # 除了空格其他的全角半角的公式为:半角=全角-0xfee0
  37. inside_code = 0x3000
  38. else:
  39. inside_code += 0xfee0
  40. return chr(inside_code)
  41. def Q2B(uchar):
  42. """全角转半角"""
  43. inside_code = ord(uchar)
  44. if inside_code == 0x3000:
  45. inside_code = 0x0020
  46. else:
  47. inside_code -= 0xfee0
  48. if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符
  49. return uchar
  50. return chr(inside_code)
  51. def stringQ2B(ustring):
  52. """把字符串全角转半角"""
  53. return "".join([Q2B(uchar) for uchar in ustring])
  54. def uniform(ustring):
  55. """格式化字符串,完成全角转半角,大写转小写的工作"""
  56. return stringQ2B(ustring).lower()
  57. def remove_punctuation(strs):
  58. """
  59. 去除标点符号
  60. :param strs:
  61. :return:
  62. """
  63. return re.sub(r"[\s+\.\!\/<>“”,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", strs.strip())
  64. def traditional2simplified(sentence):
  65. """
  66. 将sentence中的繁体字转为简体字
  67. :param sentence: 待转换的句子
  68. :return: 将句子中繁体字转换为简体字之后的句子
  69. """
  70. return Converter('zh-hans').convert(sentence)
  71. def simplified2traditional(sentence):
  72. """
  73. 将sentence中的简体字转为繁体字
  74. :param sentence: 待转换的句子
  75. :return: 将句子中简体字转换为繁体字之后的句子
  76. """
  77. return Converter('zh-hant').convert(sentence)
  78. def get_homophones_by_char(input_char):
  79. """
  80. 根据汉字取同音字
  81. :param input_char:
  82. :return:
  83. """
  84. result = []
  85. # CJK统一汉字区的范围是0x4E00-0x9FA5,也就是我们经常提到的20902个汉字
  86. for i in range(0x4e00, 0x9fa6):
  87. if pinyin([chr(i)], style=pypinyin.NORMAL)[0][0] == pinyin(input_char, style=pypinyin.NORMAL)[0][0]:
  88. result.append(chr(i))
  89. return result
  90. def get_homophones_by_pinyin(input_pinyin):
  91. """
  92. 根据拼音取同音字
  93. :param input_pinyin:
  94. :return:
  95. """
  96. result = []
  97. # CJK统一汉字区的范围是0x4E00-0x9FA5,也就是我们经常提到的20902个汉字
  98. for i in range(0x4e00, 0x9fa6):
  99. if pinyin([chr(i)], style=pypinyin.TONE2)[0][0] == input_pinyin:
  100. # TONE2: 中zho1ng
  101. result.append(chr(i))
  102. return result
  103. if __name__ == "__main__":
  104. a = 'nihao'
  105. print(a, is_alphabet_string(a))
  106. # test Q2B and B2Q
  107. for i in range(0x0020, 0x007F):
  108. print(Q2B(B2Q(chr(i))), B2Q(chr(i)))
  109. # test uniform
  110. ustring = '中国 人名a高频A 扇'
  111. ustring = uniform(ustring)
  112. print(ustring)
  113. print(is_other(','))
  114. print(uniform('你干么!d7&888学英 语ABC?nz'))
  115. print(is_chinese_char('喜'))
  116. print(is_chinese_string('喜,'))
  117. print(is_chinese_string('丽,'))
  118. traditional_sentence = '憂郁的臺灣烏龜'
  119. simplified_sentence = traditional2simplified(traditional_sentence)
  120. print(traditional_sentence, simplified_sentence)
  121. print(is_alphabet_string('Teacher'))
  122. print(is_alphabet_string('Teacher '))
  123. print('*' * 12)
  124. print(is_alphabet_number_string('nihao123'))
  125. print(is_alphabet_number_string('ni*hao12'))
  126. print(is_alphabet_number_string('12'))
  127. print(is_alphabet_number_string('teacher'))
  128. print(is_alphabet_number_string('oppo12'))
  129. print(is_alphabet_number_string('oppo12 '))