multi_language.py

  1. """
  2. Translate this project to other languages (experimental, please open an issue if there is any bug)
  3. Usage:
  4. 1. modify config.py, set your LLM_MODEL and API_KEY(s) to provide access to OPENAI (or any other LLM model provider)
  5. 2. modify LANG (below ↓)
  6. LANG = "English"
  7. 3. modify TransPrompt (below ↓)
  8. TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #."
  9. 4. Run `python multi_language.py`.
  10. Note: You need to run it multiple times to increase translation coverage because GPT makes mistakes sometimes.
  11. (You can also run `CACHE_ONLY=True python multi_language.py` to use cached translation mapping)
  12. 5. Find the translated program in `multi-language\English\*`
  13. P.S.
  14. - The translation mapping will be stored in `docs/translation_xxxx.json`, you can revised mistaken translation there.
  15. - If you would like to share your `docs/translation_xxxx.json`, (so that everyone can use the cached & revised translation mapping), please open a Pull Request
  16. - If there is any translation error in `docs/translation_xxxx.json`, please open a Pull Request
  17. - Welcome any Pull Request, regardless of language
  18. """
import os
import json
import functools
import re
import pickle
import time
import ast
from toolbox import get_conf

# Any non-empty value of the CACHE_ONLY environment variable enables cache-only mode (no LLM requests).
CACHE_ONLY = os.environ.get('CACHE_ONLY', False)
CACHE_FOLDER = get_conf('PATH_LOGGING')
blacklist = ['multi-language', CACHE_FOLDER, '.git', 'private_upload', 'multi_language.py', 'build', '.github', '.vscode', '__pycache__', 'venv']

# LANG = "TraditionalChinese"
# TransPrompt = f"Replace each json value `#` with translated results in Traditional Chinese, e.g., \"原始文本\":\"翻譯後文字\". Keep Json format. Do not answer #."

# LANG = "Japanese"
# TransPrompt = f"Replace each json value `#` with translated results in Japanese, e.g., \"原始文本\":\"テキストの翻訳\". Keep Json format. Do not answer #."

LANG = "English"
TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #."

if not os.path.exists(CACHE_FOLDER):
    os.makedirs(CACHE_FOLDER)

def lru_file_cache(maxsize=128, ttl=None, filename=None):
    """
    Decorator that caches a function's return value after being called with given arguments.
    It uses a Least Recently Used (LRU) cache strategy to limit the size of the cache.
    maxsize: Maximum size of the cache. Defaults to 128.
    ttl: Time-to-Live of the cache. If a value hasn't been accessed for `ttl` seconds, it will be evicted from the cache.
    filename: Name of the file to store the cache in. If not supplied, the cache is kept in memory only.
    """
    cache_path = os.path.join(CACHE_FOLDER, f"{filename}.cache") if filename is not None else None

    def decorator_function(func):
        cache = {}
        _cache_info = {
            "hits": 0,
            "misses": 0,
            "maxsize": maxsize,
            "currsize": 0,
            "ttl": ttl,
            "filename": cache_path,
        }

        @functools.wraps(func)
        def wrapper_function(*args, **kwargs):
            # Include kwarg values (not just names) in the cache key.
            key = str((args, frozenset(kwargs.items())))
            if key in cache:
                if _cache_info["ttl"] is None or (cache[key][1] + _cache_info["ttl"]) >= time.time():
                    _cache_info["hits"] += 1
                    print(f'Warning, reading cache, last read {(time.time()-cache[key][1])//60} minutes ago'); time.sleep(2)
                    cache[key][1] = time.time()
                    return cache[key][0]
                else:
                    del cache[key]
            result = func(*args, **kwargs)
            cache[key] = [result, time.time()]
            _cache_info["misses"] += 1
            _cache_info["currsize"] += 1
            if _cache_info["currsize"] > _cache_info["maxsize"]:
                # Evict the least recently used entry (smallest timestamp).
                oldest_key = None
                for k in cache:
                    if oldest_key is None:
                        oldest_key = k
                    elif cache[k][1] < cache[oldest_key][1]:
                        oldest_key = k
                del cache[oldest_key]
                _cache_info["currsize"] -= 1
            if cache_path is not None:
                with open(cache_path, "wb") as f:
                    pickle.dump(cache, f)
            return result

        def cache_info():
            return _cache_info

        wrapper_function.cache_info = cache_info

        if cache_path is not None and os.path.exists(cache_path):
            with open(cache_path, "rb") as f:
                cache = pickle.load(f)
            _cache_info["currsize"] = len(cache)

        return wrapper_function

    return decorator_function
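
# A minimal usage sketch of lru_file_cache (illustrative only; `_demo_square` is a
# hypothetical helper, not part of this script). The cache file would live under
# PATH_LOGGING as demo.cache and persist between runs:
#
#   @lru_file_cache(maxsize=32, ttl=24 * 3600, filename="demo")
#   def _demo_square(x):
#       return x * x
#
#   _demo_square(3)            # miss: computed, then pickled to demo.cache
#   _demo_square(3)            # hit: served from the cache
#   _demo_square.cache_info()  # -> {'hits': 1, 'misses': 1, 'maxsize': 32, ...}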

def contains_chinese(string):
    """
    Returns True if the given string contains Chinese characters, False otherwise.
    """
    chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
    return chinese_regex.search(string) is not None
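# e.g. contains_chinese("hello") -> False; contains_chinese("打开文件") -> True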

def split_list(lst, n_each_req):
    """
    Split a list into smaller lists, each with a maximum number of elements.
    :param lst: the list to split
    :param n_each_req: the maximum number of elements in each sub-list
    :return: a list of sub-lists
    """
    result = []
    for i in range(0, len(lst), n_each_req):
        result.append(lst[i:i + n_each_req])
    return result
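# e.g. split_list([1, 2, 3, 4, 5], n_each_req=2) -> [[1, 2], [3, 4], [5]]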

def map_to_json(mapping, language):
    dict_ = read_map_from_json(language)
    dict_.update(mapping)
    with open(f'docs/translate_{language.lower()}.json', 'w', encoding='utf8') as f:
        json.dump(dict_, f, indent=4, ensure_ascii=False)

def read_map_from_json(language):
    if os.path.exists(f'docs/translate_{language.lower()}.json'):
        with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f:
            res = json.load(f)
            # Drop failed entries (None) and keys that contain no Chinese text.
            res = {k: v for k, v in res.items() if v is not None and contains_chinese(k)}
            return res
    return {}
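
# These two helpers round-trip the mapping file, e.g. (illustrative values):
#   map_to_json({"打开": "Open"}, language="English")  # merges into docs/translate_english.json
#   read_map_from_json(language="English")             # -> {"打开": "Open", ...}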

def advanced_split(splitted_string, spliter, include_spliter=False):
    """
    Split every string in `splitted_string` on `spliter`, keeping only the
    fragments that still contain Chinese characters.
    """
    splitted_string_tmp = []
    for string_ in splitted_string:
        if spliter in string_:
            splitted = string_.split(spliter)
            for i, s in enumerate(splitted):
                if include_spliter:
                    if i != len(splitted)-1:
                        splitted[i] += spliter
                splitted[i] = splitted[i].strip()
            for i in reversed(range(len(splitted))):
                if not contains_chinese(splitted[i]):
                    splitted.pop(i)
            splitted_string_tmp.extend(splitted)
        else:
            splitted_string_tmp.append(string_)
    return splitted_string_tmp
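# e.g. advanced_split(["打开文件, then click", "保存"], spliter=",")
#      -> ["打开文件", "保存"]  (the non-Chinese fragment "then click" is dropped)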

cached_translation = read_map_from_json(language=LANG)

def trans(word_to_translate, language, special=False):
    if len(word_to_translate) == 0: return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies, load_chat_cookies
    cookies = load_chat_cookies()
    llm_kwargs = {
        'api_key': cookies['api_key'],
        'llm_model': cookies['llm_model'],
        'top_p': 1.0,
        'max_length': None,
        'temperature': 0.4,
    }
    import random
    N_EACH_REQ = random.randint(16, 32)
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    inputs_array = [str(s) for s in word_to_translate_split]
    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    if special:  # to English using CamelCase Naming Convention
        sys_prompt_array = ["Translate following names to English with CamelCase naming convention. Keep original format" for _ in inputs_array]
    else:
        sys_prompt_array = [f"Translate following sentences to {LANG}. E.g., You should translate sentences to the following format ['translation of sentence 1', 'translation of sentence 2']. Do NOT answer with Chinese!" for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    # `result` alternates between the request (even index) and the model's answer (odd index).
    for i, r in enumerate(result):
        if i % 2 == 1:
            try:
                res_before_trans = ast.literal_eval(result[i-1])  # safer than eval() on model I/O
                res_after_trans = ast.literal_eval(result[i])
                if len(res_before_trans) != len(res_after_trans):
                    raise RuntimeError
                for a, b in zip(res_before_trans, res_after_trans):
                    translated_result[a] = b
            except:
                # try:
                #     res_before_trans = word_to_translate_split[(i-1)//2]
                #     res_after_trans = [s for s in result[i].split("', '")]
                #     for a, b in zip(res_before_trans, res_after_trans):
                #         translated_result[a] = b
                # except:
                print('GPT answers with unexpected format, some words may not be translated, but you can try again later to increase translation coverage.')
                res_before_trans = ast.literal_eval(result[i-1])
                for a in res_before_trans:
                    translated_result[a] = None
    return translated_result
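
# Illustrative call (hypothetical values; real output depends on the model's answers):
#   trans(["打开文件", "保存"], language="English")
#   -> {"打开文件": "Open file", "保存": "Save"}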

def trans_json(word_to_translate, language, special=False):
    if len(word_to_translate) == 0: return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies, load_chat_cookies
    cookies = load_chat_cookies()
    llm_kwargs = {
        'api_key': cookies['api_key'],
        'llm_model': cookies['llm_model'],
        'top_p': 1.0,
        'max_length': None,
        'temperature': 0.4,
    }
    import random
    N_EACH_REQ = random.randint(16, 32)
    random.shuffle(word_to_translate)  # note: shuffles the caller's list in place
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    inputs_array = [{k: "#" for k in s} for s in word_to_translate_split]
    inputs_array = [json.dumps(i, ensure_ascii=False) for i in inputs_array]
    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    sys_prompt_array = [TransPrompt for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    for i, r in enumerate(result):
        if i % 2 == 1:
            # Odd indices hold the model's answers; they should be valid JSON.
            try:
                translated_result.update(json.loads(result[i]))
            except:
                print(result[i])
                print(result)
    return translated_result
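
# The `#` placeholder protocol used above, illustratively (hypothetical values):
#   request : {"打开文件": "#", "保存": "#"}
#   response: {"打开文件": "Open file", "保存": "Save"}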

def step_1_core_key_translate():
    LANG_STD = 'std'

    def extract_chinese_characters(file_path):
        syntax = []
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        root = ast.parse(content)
        for node in ast.walk(root):
            if isinstance(node, ast.Name):
                if contains_chinese(node.id): syntax.append(node.id)
            if isinstance(node, ast.Import):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
            elif isinstance(node, ast.ImportFrom):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
                if node.module is not None:  # node.module is None for relative imports such as `from . import x`
                    for k in node.module.split('.'):
                        if contains_chinese(k): syntax.append(k)
        return syntax
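
    # e.g. for a file containing `from 工具箱 import 打开文件`, extract_chinese_characters
    # returns ['打开文件', '工具箱'] (identifiers only; string literals are handled in step 2).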

    def extract_chinese_characters_from_directory(directory_path):
        chinese_characters = []
        for root, dirs, files in os.walk(directory_path):
            if any([b in root for b in blacklist]):
                continue
            for file in files:
                if file.endswith('.py'):
                    file_path = os.path.join(root, file)
                    chinese_characters.extend(extract_chinese_characters(file_path))
        return chinese_characters

    directory_path = './'
    chinese_core_names = extract_chinese_characters_from_directory(directory_path)
    chinese_core_keys = list(chinese_core_names)

    # Deduplicate while preserving order.
    chinese_core_keys_norepeat = []
    for d in chinese_core_keys:
        if d not in chinese_core_keys_norepeat: chinese_core_keys_norepeat.append(d)

    need_translate = []
    cached_translation = read_map_from_json(language=LANG_STD)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_core_keys_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)

    if CACHE_ONLY:
        need_translate_mapping = {}
    else:
        need_translate_mapping = trans(need_translate, language=LANG_STD, special=True)
    map_to_json(need_translate_mapping, language=LANG_STD)
    cached_translation = read_map_from_json(language=LANG_STD)
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))

    chinese_core_keys_norepeat_mapping = {}
    for k in chinese_core_keys_norepeat:
        if k in cached_translation:  # a key may be missing if translation failed or CACHE_ONLY is set
            chinese_core_keys_norepeat_mapping.update({k: cached_translation[k]})
    chinese_core_keys_norepeat_mapping = dict(sorted(chinese_core_keys_norepeat_mapping.items(), key=lambda x: -len(x[0])))

    # ===============================================
    # copy
    # ===============================================
    def copy_source_code():
        import shutil
        try: shutil.rmtree(f'./multi-language/{LANG}/')
        except: pass
        os.makedirs('./multi-language', exist_ok=True)
        backup_dir = f'./multi-language/{LANG}/'
        # The ignore callable returns the same name blacklist for every directory visited.
        shutil.copytree('./', backup_dir, ignore=lambda x, y: blacklist)
    copy_source_code()

    # ===============================================
    # primary key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Longest keys first (sorted above), so longer identifiers are replaced before their substrings.
                for k, v in chinese_core_keys_norepeat_mapping.items():
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)

def step_2_core_key_translate():
    # =================================================================================================
    # step2
    # =================================================================================================
    def load_string(strings, string_input):
        string_ = string_input.strip().strip(',').strip().strip('.').strip()
        if string_.startswith('[Local Message]'):
            string_ = string_.replace('[Local Message]', '')
            string_ = string_.strip().strip(',').strip().strip('.').strip()
        splitted_string = [string_]
        # --------------------------------------
        splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="。", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=")", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="(", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="(", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=")", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="<", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=">", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="[", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="]", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="【", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="】", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="?", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="#", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="\n", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=";", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="`", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=" ", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="- ", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="---", include_spliter=False)
        # --------------------------------------
        # Skip fragments containing URLs or quotes; they are risky to replace verbatim.
        for j, s in enumerate(splitted_string):
            if '.com' in s: continue
            if '\'' in s: continue
            if '\"' in s: continue
            strings.append([s, 0])

    def get_strings(node):
        strings = []
        # recursively traverse the AST
        for child in ast.iter_child_nodes(node):
            # ast.Str was removed in Python 3.12; string literals are ast.Constant nodes.
            if isinstance(child, ast.Constant) and isinstance(child.value, str):
                if contains_chinese(child.value):
                    load_string(strings=strings, string_input=child.value)
            elif isinstance(child, ast.AST):
                strings.extend(get_strings(child))
        return strings
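
    # e.g. get_strings(ast.parse('a = "你好"')) -> [["你好", 0]]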

    string_literals = []
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # comments
                comments_arr = []
                for code_sp in content.splitlines():
                    comments = re.findall(r'#.*$', code_sp)
                    for comment in comments:
                        load_string(strings=comments_arr, string_input=comment)
                string_literals.extend(comments_arr)
                # strings
                tree = ast.parse(content)
                res = get_strings(tree)
                string_literals.extend(res)

    for s in string_literals:
        print(s)

    chinese_literal_names = []
    for string, offset in string_literals:
        chinese_literal_names.append(string)
    # Deduplicate while preserving order.
    chinese_literal_names_norepeat = []
    for d in chinese_literal_names:
        if d not in chinese_literal_names_norepeat: chinese_literal_names_norepeat.append(d)

    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_literal_names_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)

    if CACHE_ONLY:
        up = {}
    else:
        up = trans_json(need_translate, language=LANG, special=False)
    map_to_json(up, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    LANG_STD = 'std'
    cached_translation.update(read_map_from_json(language=LANG_STD))
    # Longest keys first, so longer phrases are replaced before their substrings.
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))

    # ===============================================
    # literal key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                for k, v in cached_translation.items():
                    if v is None: continue
                    # Quotes inside a translation would break the surrounding string literal; use backticks instead.
                    if '"' in v:
                        v = v.replace('"', "`")
                    if '\'' in v:
                        v = v.replace('\'', "`")
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                # Rename the file if its base name (without the .py suffix) has a translation.
                base_name = os.path.splitext(file)[0]
                if base_name in cached_translation:
                    file_new = cached_translation[base_name] + '.py'
                    file_path_new = os.path.join(root, file_new)
                    with open(file_path_new, 'w', encoding='utf-8') as f:
                        f.write(content)
                    os.remove(file_path)

step_1_core_key_translate()
step_2_core_key_translate()
print('Finished, check out the generated results at ./multi-language/')