123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527 |
- """
- Translate this project to other languages (experimental, please open an issue if there is any bug)
- Usage:
- 1. modify config.py, set your LLM_MODEL and API_KEY(s) to provide access to OPENAI (or any other LLM model provider)
- 2. modify LANG (below ↓)
- LANG = "English"
- 3. modify TransPrompt (below ↓)
- TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #."
- 4. Run `python multi_language.py`.
- Note: You need to run it multiple times to increase translation coverage because GPT makes mistakes sometimes.
- (You can also run `CACHE_ONLY=True python multi_language.py` to use cached translation mapping)
- 5. Find the translated program in `multi-language\English\*`
- P.S.
- - The translation mapping will be stored in `docs/translation_xxxx.json`, you can revised mistaken translation there.
- - If you would like to share your `docs/translation_xxxx.json`, (so that everyone can use the cached & revised translation mapping), please open a Pull Request
- - If there is any translation error in `docs/translation_xxxx.json`, please open a Pull Request
- - Welcome any Pull Request, regardless of language
- """
- import os
- import json
- import functools
- import re
- import pickle
- import time
- from toolbox import get_conf
- CACHE_ONLY = os.environ.get('CACHE_ONLY', False)
- CACHE_FOLDER = get_conf('PATH_LOGGING')
- blacklist = ['multi-language', CACHE_FOLDER, '.git', 'private_upload', 'multi_language.py', 'build', '.github', '.vscode', '__pycache__', 'venv']
- # LANG = "TraditionalChinese"
- # TransPrompt = f"Replace each json value `#` with translated results in Traditional Chinese, e.g., \"原始文本\":\"翻譯後文字\". Keep Json format. Do not answer #."
- # LANG = "Japanese"
- # TransPrompt = f"Replace each json value `#` with translated results in Japanese, e.g., \"原始文本\":\"テキストの翻訳\". Keep Json format. Do not answer #."
- LANG = "English"
- TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #."
- if not os.path.exists(CACHE_FOLDER):
- os.makedirs(CACHE_FOLDER)
- def lru_file_cache(maxsize=128, ttl=None, filename=None):
- """
- Decorator that caches a function's return value after being called with given arguments.
- It uses a Least Recently Used (LRU) cache strategy to limit the size of the cache.
- maxsize: Maximum size of the cache. Defaults to 128.
- ttl: Time-to-Live of the cache. If a value hasn't been accessed for `ttl` seconds, it will be evicted from the cache.
- filename: Name of the file to store the cache in. If not supplied, the function name + ".cache" will be used.
- """
- cache_path = os.path.join(CACHE_FOLDER, f"{filename}.cache") if filename is not None else None
- def decorator_function(func):
- cache = {}
- _cache_info = {
- "hits": 0,
- "misses": 0,
- "maxsize": maxsize,
- "currsize": 0,
- "ttl": ttl,
- "filename": cache_path,
- }
- @functools.wraps(func)
- def wrapper_function(*args, **kwargs):
- key = str((args, frozenset(kwargs)))
- if key in cache:
- if _cache_info["ttl"] is None or (cache[key][1] + _cache_info["ttl"]) >= time.time():
- _cache_info["hits"] += 1
- print(f'Warning, reading cache, last read {(time.time()-cache[key][1])//60} minutes ago'); time.sleep(2)
- cache[key][1] = time.time()
- return cache[key][0]
- else:
- del cache[key]
- result = func(*args, **kwargs)
- cache[key] = [result, time.time()]
- _cache_info["misses"] += 1
- _cache_info["currsize"] += 1
- if _cache_info["currsize"] > _cache_info["maxsize"]:
- oldest_key = None
- for k in cache:
- if oldest_key is None:
- oldest_key = k
- elif cache[k][1] < cache[oldest_key][1]:
- oldest_key = k
- del cache[oldest_key]
- _cache_info["currsize"] -= 1
- if cache_path is not None:
- with open(cache_path, "wb") as f:
- pickle.dump(cache, f)
- return result
- def cache_info():
- return _cache_info
- wrapper_function.cache_info = cache_info
- if cache_path is not None and os.path.exists(cache_path):
- with open(cache_path, "rb") as f:
- cache = pickle.load(f)
- _cache_info["currsize"] = len(cache)
- return wrapper_function
- return decorator_function
- def contains_chinese(string):
- """
- Returns True if the given string contains Chinese characters, False otherwise.
- """
- chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
- return chinese_regex.search(string) is not None
- def split_list(lst, n_each_req):
- """
- Split a list into smaller lists, each with a maximum number of elements.
- :param lst: the list to split
- :param n_each_req: the maximum number of elements in each sub-list
- :return: a list of sub-lists
- """
- result = []
- for i in range(0, len(lst), n_each_req):
- result.append(lst[i:i + n_each_req])
- return result
- def map_to_json(map, language):
- dict_ = read_map_from_json(language)
- dict_.update(map)
- with open(f'docs/translate_{language.lower()}.json', 'w', encoding='utf8') as f:
- json.dump(dict_, f, indent=4, ensure_ascii=False)
- def read_map_from_json(language):
- if os.path.exists(f'docs/translate_{language.lower()}.json'):
- with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f:
- res = json.load(f)
- res = {k:v for k, v in res.items() if v is not None and contains_chinese(k)}
- return res
- return {}
- def advanced_split(splitted_string, spliter, include_spliter=False):
- splitted_string_tmp = []
- for string_ in splitted_string:
- if spliter in string_:
- splitted = string_.split(spliter)
- for i, s in enumerate(splitted):
- if include_spliter:
- if i != len(splitted)-1:
- splitted[i] += spliter
- splitted[i] = splitted[i].strip()
- for i in reversed(range(len(splitted))):
- if not contains_chinese(splitted[i]):
- splitted.pop(i)
- splitted_string_tmp.extend(splitted)
- else:
- splitted_string_tmp.append(string_)
- splitted_string = splitted_string_tmp
- return splitted_string_tmp
- cached_translation = {}
- cached_translation = read_map_from_json(language=LANG)
- def trans(word_to_translate, language, special=False):
- if len(word_to_translate) == 0: return {}
- from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
- from toolbox import get_conf, ChatBotWithCookies, load_chat_cookies
- cookies = load_chat_cookies()
- llm_kwargs = {
- 'api_key': cookies['api_key'],
- 'llm_model': cookies['llm_model'],
- 'top_p':1.0,
- 'max_length': None,
- 'temperature':0.4,
- }
- import random
- N_EACH_REQ = random.randint(16, 32)
- word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
- inputs_array = [str(s) for s in word_to_translate_split]
- inputs_show_user_array = inputs_array
- history_array = [[] for _ in inputs_array]
- if special: # to English using CamelCase Naming Convention
- sys_prompt_array = [f"Translate following names to English with CamelCase naming convention. Keep original format" for _ in inputs_array]
- else:
- sys_prompt_array = [f"Translate following sentences to {LANG}. E.g., You should translate sentences to the following format ['translation of sentence 1', 'translation of sentence 2']. Do NOT answer with Chinese!" for _ in inputs_array]
- chatbot = ChatBotWithCookies(llm_kwargs)
- gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array,
- inputs_show_user_array,
- llm_kwargs,
- chatbot,
- history_array,
- sys_prompt_array,
- )
- while True:
- try:
- gpt_say = next(gpt_say_generator)
- print(gpt_say[1][0][1])
- except StopIteration as e:
- result = e.value
- break
- translated_result = {}
- for i, r in enumerate(result):
- if i%2 == 1:
- try:
- res_before_trans = eval(result[i-1])
- res_after_trans = eval(result[i])
- if len(res_before_trans) != len(res_after_trans):
- raise RuntimeError
- for a,b in zip(res_before_trans, res_after_trans):
- translated_result[a] = b
- except:
- # try:
- # res_before_trans = word_to_translate_split[(i-1)//2]
- # res_after_trans = [s for s in result[i].split("', '")]
- # for a,b in zip(res_before_trans, res_after_trans):
- # translated_result[a] = b
- # except:
- print('GPT answers with unexpected format, some words may not be translated, but you can try again later to increase translation coverage.')
- res_before_trans = eval(result[i-1])
- for a in res_before_trans:
- translated_result[a] = None
- return translated_result
- def trans_json(word_to_translate, language, special=False):
- if len(word_to_translate) == 0: return {}
- from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
- from toolbox import get_conf, ChatBotWithCookies, load_chat_cookies
- cookies = load_chat_cookies()
- llm_kwargs = {
- 'api_key': cookies['api_key'],
- 'llm_model': cookies['llm_model'],
- 'top_p':1.0,
- 'max_length': None,
- 'temperature':0.4,
- }
- import random
- N_EACH_REQ = random.randint(16, 32)
- random.shuffle(word_to_translate)
- word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
- inputs_array = [{k:"#" for k in s} for s in word_to_translate_split]
- inputs_array = [ json.dumps(i, ensure_ascii=False) for i in inputs_array]
- inputs_show_user_array = inputs_array
- history_array = [[] for _ in inputs_array]
- sys_prompt_array = [TransPrompt for _ in inputs_array]
- chatbot = ChatBotWithCookies(llm_kwargs)
- gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array,
- inputs_show_user_array,
- llm_kwargs,
- chatbot,
- history_array,
- sys_prompt_array,
- )
- while True:
- try:
- gpt_say = next(gpt_say_generator)
- print(gpt_say[1][0][1])
- except StopIteration as e:
- result = e.value
- break
- translated_result = {}
- for i, r in enumerate(result):
- if i%2 == 1:
- try:
- translated_result.update(json.loads(result[i]))
- except:
- print(result[i])
- print(result)
- return translated_result
- def step_1_core_key_translate():
- LANG_STD = 'std'
- def extract_chinese_characters(file_path):
- syntax = []
- with open(file_path, 'r', encoding='utf-8') as f:
- content = f.read()
- import ast
- root = ast.parse(content)
- for node in ast.walk(root):
- if isinstance(node, ast.Name):
- if contains_chinese(node.id): syntax.append(node.id)
- if isinstance(node, ast.Import):
- for n in node.names:
- if contains_chinese(n.name): syntax.append(n.name)
- elif isinstance(node, ast.ImportFrom):
- for n in node.names:
- if contains_chinese(n.name): syntax.append(n.name)
- # if node.module is None: print(node.module)
- for k in node.module.split('.'):
- if contains_chinese(k): syntax.append(k)
- return syntax
- def extract_chinese_characters_from_directory(directory_path):
- chinese_characters = []
- for root, dirs, files in os.walk(directory_path):
- if any([b in root for b in blacklist]):
- continue
- print(files)
- for file in files:
- if file.endswith('.py'):
- file_path = os.path.join(root, file)
- chinese_characters.extend(extract_chinese_characters(file_path))
- return chinese_characters
- directory_path = './'
- chinese_core_names = extract_chinese_characters_from_directory(directory_path)
- chinese_core_keys = [name for name in chinese_core_names]
- chinese_core_keys_norepeat = []
- for d in chinese_core_keys:
- if d not in chinese_core_keys_norepeat: chinese_core_keys_norepeat.append(d)
- need_translate = []
- cached_translation = read_map_from_json(language=LANG_STD)
- cached_translation_keys = list(cached_translation.keys())
- for d in chinese_core_keys_norepeat:
- if d not in cached_translation_keys:
- need_translate.append(d)
- if CACHE_ONLY:
- need_translate_mapping = {}
- else:
- need_translate_mapping = trans(need_translate, language=LANG_STD, special=True)
- map_to_json(need_translate_mapping, language=LANG_STD)
- cached_translation = read_map_from_json(language=LANG_STD)
- cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
- chinese_core_keys_norepeat_mapping = {}
- for k in chinese_core_keys_norepeat:
- chinese_core_keys_norepeat_mapping.update({k:cached_translation[k]})
- chinese_core_keys_norepeat_mapping = dict(sorted(chinese_core_keys_norepeat_mapping.items(), key=lambda x: -len(x[0])))
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- # copy
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- def copy_source_code():
- from toolbox import get_conf
- import shutil
- import os
- try: shutil.rmtree(f'./multi-language/{LANG}/')
- except: pass
- os.makedirs(f'./multi-language', exist_ok=True)
- backup_dir = f'./multi-language/{LANG}/'
- shutil.copytree('./', backup_dir, ignore=lambda x, y: blacklist)
- copy_source_code()
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- # primary key replace
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- directory_path = f'./multi-language/{LANG}/'
- for root, dirs, files in os.walk(directory_path):
- for file in files:
- if file.endswith('.py'):
- file_path = os.path.join(root, file)
- syntax = []
- # read again
- with open(file_path, 'r', encoding='utf-8') as f:
- content = f.read()
- for k, v in chinese_core_keys_norepeat_mapping.items():
- content = content.replace(k, v)
- with open(file_path, 'w', encoding='utf-8') as f:
- f.write(content)
- def step_2_core_key_translate():
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
- # step2
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
- def load_string(strings, string_input):
- string_ = string_input.strip().strip(',').strip().strip('.').strip()
- if string_.startswith('[Local Message]'):
- string_ = string_.replace('[Local Message]', '')
- string_ = string_.strip().strip(',').strip().strip('.').strip()
- splitted_string = [string_]
- # --------------------------------------
- splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="。", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=")", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="(", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="(", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=")", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="<", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=">", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="[", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="]", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="【", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="】", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="?", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="#", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="\n", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=";", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="`", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter=" ", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="- ", include_spliter=False)
- splitted_string = advanced_split(splitted_string, spliter="---", include_spliter=False)
- # --------------------------------------
- for j, s in enumerate(splitted_string): # .com
- if '.com' in s: continue
- if '\'' in s: continue
- if '\"' in s: continue
- strings.append([s,0])
- def get_strings(node):
- strings = []
- # recursively traverse the AST
- for child in ast.iter_child_nodes(node):
- node = child
- if isinstance(child, ast.Str):
- if contains_chinese(child.s):
- load_string(strings=strings, string_input=child.s)
- elif isinstance(child, ast.AST):
- strings.extend(get_strings(child))
- return strings
- string_literals = []
- directory_path = f'./multi-language/{LANG}/'
- for root, dirs, files in os.walk(directory_path):
- for file in files:
- if file.endswith('.py'):
- file_path = os.path.join(root, file)
- syntax = []
- with open(file_path, 'r', encoding='utf-8') as f:
- content = f.read()
- # comments
- comments_arr = []
- for code_sp in content.splitlines():
- comments = re.findall(r'#.*$', code_sp)
- for comment in comments:
- load_string(strings=comments_arr, string_input=comment)
- string_literals.extend(comments_arr)
- # strings
- import ast
- tree = ast.parse(content)
- res = get_strings(tree, )
- string_literals.extend(res)
- [print(s) for s in string_literals]
- chinese_literal_names = []
- chinese_literal_names_norepeat = []
- for string, offset in string_literals:
- chinese_literal_names.append(string)
- chinese_literal_names_norepeat = []
- for d in chinese_literal_names:
- if d not in chinese_literal_names_norepeat: chinese_literal_names_norepeat.append(d)
- need_translate = []
- cached_translation = read_map_from_json(language=LANG)
- cached_translation_keys = list(cached_translation.keys())
- for d in chinese_literal_names_norepeat:
- if d not in cached_translation_keys:
- need_translate.append(d)
- if CACHE_ONLY:
- up = {}
- else:
- up = trans_json(need_translate, language=LANG, special=False)
- map_to_json(up, language=LANG)
- cached_translation = read_map_from_json(language=LANG)
- LANG_STD = 'std'
- cached_translation.update(read_map_from_json(language=LANG_STD))
- cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- # literal key replace
- # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- directory_path = f'./multi-language/{LANG}/'
- for root, dirs, files in os.walk(directory_path):
- for file in files:
- if file.endswith('.py'):
- file_path = os.path.join(root, file)
- syntax = []
- # read again
- with open(file_path, 'r', encoding='utf-8') as f:
- content = f.read()
- for k, v in cached_translation.items():
- if v is None: continue
- if '"' in v:
- v = v.replace('"', "`")
- if '\'' in v:
- v = v.replace('\'', "`")
- content = content.replace(k, v)
- with open(file_path, 'w', encoding='utf-8') as f:
- f.write(content)
- if file.strip('.py') in cached_translation:
- file_new = cached_translation[file.strip('.py')] + '.py'
- file_path_new = os.path.join(root, file_new)
- with open(file_path_new, 'w', encoding='utf-8') as f:
- f.write(content)
- os.remove(file_path)
- step_1_core_key_translate()
- step_2_core_key_translate()
- print('Finished, checkout generated results at ./multi-language/')
|