#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Forked from https://github.com/skydark/nstools/blob/master/zhtools/langconv.py
  4. from copy import deepcopy
  5. try:
  6. import psyco
  7. psyco.full()
  8. except:
  9. pass
  10. from pycorrector.utils.zh_wiki import zh2Hant, zh2Hans
  11. import sys
  12. py3k = sys.version_info >= (3, 0, 0)
  13. if py3k:
  14. UEMPTY = ''
  15. else:
  16. _zh2Hant, _zh2Hans = {}, {}
  17. for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
  18. for k, v in old.items():
  19. new[k.decode('utf8')] = v.decode('utf8')
  20. zh2Hant = _zh2Hant
  21. zh2Hans = _zh2Hans
  22. UEMPTY = ''.decode('utf8')
  23. # states
  24. (START, END, FAIL, WAIT_TAIL) = list(range(4))
  25. # conditions
  26. (TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))
  27. MAPS = {}
  28. class Node(object):
  29. def __init__(self, from_word, to_word=None, is_tail=True,
  30. have_child=False):
  31. self.from_word = from_word
  32. if to_word is None:
  33. self.to_word = from_word
  34. self.data = (is_tail, have_child, from_word)
  35. self.is_original = True
  36. else:
  37. self.to_word = to_word or from_word
  38. self.data = (is_tail, have_child, to_word)
  39. self.is_original = False
  40. self.is_tail = is_tail
  41. self.have_child = have_child
  42. def is_original_long_word(self):
  43. return self.is_original and len(self.from_word) > 1
  44. def is_follow(self, chars):
  45. return chars != self.from_word[:-1]
  46. def __str__(self):
  47. return '<Node, %s, %s, %s, %s>' % (repr(self.from_word),
  48. repr(self.to_word), self.is_tail, self.have_child)
  49. __repr__ = __str__
  50. class ConvertMap(object):
  51. def __init__(self, name, mapping=None):
  52. self.name = name
  53. self._map = {}
  54. if mapping:
  55. self.set_convert_map(mapping)
  56. def set_convert_map(self, mapping):
  57. convert_map = {}
  58. have_child = {}
  59. max_key_length = 0
  60. for key in sorted(mapping.keys()):
  61. if len(key) > 1:
  62. for i in range(1, len(key)):
  63. parent_key = key[:i]
  64. have_child[parent_key] = True
  65. have_child[key] = False
  66. max_key_length = max(max_key_length, len(key))
  67. for key in sorted(have_child.keys()):
  68. convert_map[key] = (key in mapping, have_child[key],
  69. mapping.get(key, UEMPTY))
  70. self._map = convert_map
  71. self.max_key_length = max_key_length
  72. def __getitem__(self, k):
  73. try:
  74. is_tail, have_child, to_word = self._map[k]
  75. return Node(k, to_word, is_tail, have_child)
  76. except:
  77. return Node(k)
  78. def __contains__(self, k):
  79. return k in self._map
  80. def __len__(self):
  81. return len(self._map)
  82. class StatesMachineException(Exception): pass
class StatesMachine(object):
    """One candidate conversion path, advanced one character at a time.

    Attributes:
        state: one of START, END, FAIL, WAIT_TAIL.
        final: converted text this machine has accumulated.
        len: number of nodes whose ``to_word`` was appended to ``final``.
        pool: prefix prepended to the next character when it is looked
            up in the map.
    """

    def __init__(self):
        self.state = START
        self.final = UEMPTY
        self.len = 0
        self.pool = UEMPTY

    def clone(self, pool):
        """Return a deep copy of this machine in WAIT_TAIL with *pool* set."""
        new = deepcopy(self)
        new.state = WAIT_TAIL
        new.pool = pool
        return new

    def feed(self, char, map):
        """Consume *char*, looking up ``pool + char`` in *map*.

        Returns a new StatesMachine when this call created a branch
        (via clone(), or via the recursive call made from the END
        state), otherwise None.

        Raises StatesMachineException when the machine is in FAIL and
        the looked-up node is neither an ERROR nor a TAIL condition.
        """
        node = map[self.pool + char]

        # Classify the node by its (have_child, is_tail, is_original) flags.
        if node.have_child:
            if node.is_tail:
                if node.is_original:
                    cond = UNMATCHED_SWITCH
                else:
                    cond = MATCHED_SWITCH
            else:
                cond = CONNECTOR
        else:
            if node.is_tail:
                cond = TAIL
            else:
                cond = ERROR

        new = None
        if cond == ERROR:
            self.state = FAIL
        elif cond == TAIL:
            # A machine waiting for a tail fails on an unconverted
            # multi-character word; otherwise the node is emitted.
            if self.state == WAIT_TAIL and node.is_original_long_word():
                self.state = FAIL
            else:
                self.final += node.to_word
                self.len += 1
                self.pool = UEMPTY
                self.state = END
        elif self.state == START or self.state == WAIT_TAIL:
            if cond == MATCHED_SWITCH:
                # Branch: the clone waits with this word as its pool,
                # while this machine emits the word and ends.
                new = self.clone(node.from_word)
                self.final += node.to_word
                self.len += 1
                self.state = END
                self.pool = UEMPTY
            elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
                if self.state == START:
                    # Same branching as above, but pool is left untouched.
                    new = self.clone(node.from_word)
                    self.final += node.to_word
                    self.len += 1
                    self.state = END
                else:
                    # NOTE: is_follow() returns True when pool is NOT
                    # node.from_word minus its last character.
                    if node.is_follow(self.pool):
                        self.state = FAIL
                    else:
                        self.pool = node.from_word
        elif self.state == END:
            # END is a new START
            self.state = START
            new = self.feed(char, map)
        elif self.state == FAIL:
            raise StatesMachineException('Translate States Machine '
                    'have error with input data %s' % node)
        return new

    def __len__(self):
        # One more than the number of emitted nodes; Converter._clean
        # uses len() as its sort key.
        return self.len + 1

    def __str__(self):
        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
            id(self), self.pool, self.state, self.final)

    __repr__ = __str__
  152. class Converter(object):
  153. def __init__(self, to_encoding):
  154. self.to_encoding = to_encoding
  155. self.map = MAPS[to_encoding]
  156. self.start()
  157. def feed(self, char):
  158. branches = []
  159. for fsm in self.machines:
  160. new = fsm.feed(char, self.map)
  161. if new:
  162. branches.append(new)
  163. if branches:
  164. self.machines.extend(branches)
  165. self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
  166. all_ok = True
  167. for fsm in self.machines:
  168. if fsm.state != END:
  169. all_ok = False
  170. if all_ok:
  171. self._clean()
  172. return self.get_result()
  173. def _clean(self):
  174. if len(self.machines):
  175. self.machines.sort(key=lambda x: len(x))
  176. # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y)))
  177. self.final += self.machines[0].final
  178. self.machines = [StatesMachine()]
  179. def start(self):
  180. self.machines = [StatesMachine()]
  181. self.final = UEMPTY
  182. def end(self):
  183. self.machines = [fsm for fsm in self.machines
  184. if fsm.state == FAIL or fsm.state == END]
  185. self._clean()
  186. def convert(self, string):
  187. self.start()
  188. for char in string:
  189. self.feed(char)
  190. self.end()
  191. return self.get_result()
  192. def get_result(self):
  193. return self.final
  194. def registery(name, mapping):
  195. global MAPS
  196. MAPS[name] = ConvertMap(name, mapping)
# Register both tables in MAPS at import time under the names passed to
# Converter(), then delete the module-level names for the raw tables.
registery('zh-hant', zh2Hant)
registery('zh-hans', zh2Hans)
del zh2Hant, zh2Hans
  200. def run():
  201. import sys
  202. from optparse import OptionParser
  203. parser = OptionParser()
  204. parser.add_option('-e', type='string', dest='encoding',
  205. help='encoding')
  206. parser.add_option('-f', type='string', dest='file_in',
  207. help='input file (- for stdin)')
  208. parser.add_option('-t', type='string', dest='file_out',
  209. help='output file')
  210. (options, args) = parser.parse_args()
  211. if not options.encoding:
  212. parser.error('encoding must be set')
  213. if options.file_in:
  214. if options.file_in == '-':
  215. file_in = sys.stdin
  216. else:
  217. file_in = open(options.file_in)
  218. else:
  219. file_in = sys.stdin
  220. if options.file_out:
  221. if options.file_out == '-':
  222. file_out = sys.stdout
  223. else:
  224. file_out = open(options.file_out, 'wb')
  225. else:
  226. file_out = sys.stdout
  227. c = Converter(options.encoding)
  228. for line in file_in:
  229. # print >> file_out, c.convert(line.rstrip('\n').decode(
  230. file_out.write(c.convert(line.rstrip('\n').decode(
  231. 'utf8')).encode('utf8'))
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    run()