| """ | |
| Translate this project to other languages (experimental, please open an issue if there is any bug) | |
| Usage: | |
| 1. modify LANG | |
| LANG = "English" | |
| 2. modify TransPrompt | |
| TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #." | |
| 3. Run `python multi_language.py`. | |
| Note: You need to run it multiple times to increase translation coverage because GPT makes mistakes sometimes. | |
| 4. Find the translated program in `multi-language\English\*` | |
| P.S. | |
| - The translation mapping will be stored in `docs/translation_xxxx.json`, you can revised mistaken translation there. | |
| - If you would like to share your `docs/translation_xxxx.json`, (so that everyone can use the cached & revised translation mapping), please open a Pull Request | |
| - If there is any translation error in `docs/translation_xxxx.json`, please open a Pull Request | |
| - Welcome any Pull Request, regardless of language | |
| """ | |
import os
import json
import functools
import re
import pickle
import time

CACHE_FOLDER = "gpt_log"
blacklist = ['multi-language', 'gpt_log', '.git', 'private_upload', 'multi_language.py']

# LANG = "TraditionalChinese"
# TransPrompt = f"Replace each json value `#` with translated results in Traditional Chinese, e.g., \"原始文本\":\"翻譯後文字\". Keep Json format. Do not answer #."

# LANG = "Japanese"
# TransPrompt = f"Replace each json value `#` with translated results in Japanese, e.g., \"原始文本\":\"テキストの翻訳\". Keep Json format. Do not answer #."

LANG = "English"
TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #."

if not os.path.exists(CACHE_FOLDER):
    os.makedirs(CACHE_FOLDER)
def lru_file_cache(maxsize=128, ttl=None, filename=None):
    """
    Decorator that caches a function's return value after it is called with given arguments.
    It uses a Least Recently Used (LRU) cache strategy to limit the size of the cache.
    maxsize: Maximum size of the cache. Defaults to 128.
    ttl: Time-to-Live of a cache entry. If an entry has not been accessed for `ttl` seconds, it is evicted from the cache.
    filename: Name of the file to store the cache in (under CACHE_FOLDER). If not supplied, the cache is kept in memory only.
    """
    cache_path = os.path.join(CACHE_FOLDER, f"{filename}.cache") if filename is not None else None

    def decorator_function(func):
        cache = {}
        _cache_info = {
            "hits": 0,
            "misses": 0,
            "maxsize": maxsize,
            "currsize": 0,
            "ttl": ttl,
            "filename": cache_path,
        }

        def wrapper_function(*args, **kwargs):
            key = str((args, frozenset(kwargs.items())))  # include kwargs values in the cache key
            if key in cache:
                if _cache_info["ttl"] is None or (cache[key][1] + _cache_info["ttl"]) >= time.time():
                    # cache hit: refresh the access time and return the stored result
                    _cache_info["hits"] += 1
                    print(f'Warning, reading cache, last read {(time.time()-cache[key][1])//60} minutes ago'); time.sleep(2)
                    cache[key][1] = time.time()
                    return cache[key][0]
                else:
                    # entry expired
                    del cache[key]

            result = func(*args, **kwargs)
            cache[key] = [result, time.time()]
            _cache_info["misses"] += 1
            _cache_info["currsize"] += 1

            if _cache_info["currsize"] > _cache_info["maxsize"]:
                # evict the least recently used entry
                oldest_key = None
                for k in cache:
                    if oldest_key is None:
                        oldest_key = k
                    elif cache[k][1] < cache[oldest_key][1]:
                        oldest_key = k
                del cache[oldest_key]
                _cache_info["currsize"] -= 1

            if cache_path is not None:
                with open(cache_path, "wb") as f:
                    pickle.dump(cache, f)

            return result

        def cache_info():
            return _cache_info
        wrapper_function.cache_info = cache_info

        if cache_path is not None and os.path.exists(cache_path):
            with open(cache_path, "rb") as f:
                cache = pickle.load(f)
            _cache_info["currsize"] = len(cache)

        return wrapper_function
    return decorator_function
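# Hedged usage sketch for `lru_file_cache` (the decorator is not used elsewhere in this script;
# `slow_lookup` and its argument are hypothetical names):
#
#   @lru_file_cache(maxsize=128, ttl=3600, filename="slow_lookup")
#   def slow_lookup(query):
#       ...                         # expensive work
#
#   slow_lookup("foo")              # miss: computes the result and stores it in gpt_log/slow_lookup.cache
#   slow_lookup("foo")              # hit within 1 hour: returns the cached value
#   slow_lookup.cache_info()        # -> {'hits': 1, 'misses': 1, 'maxsize': 128, ...}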
def contains_chinese(string):
    """
    Returns True if the given string contains Chinese characters, False otherwise.
    """
    chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
    return chinese_regex.search(string) is not None
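# For example: contains_chinese("解析项目") is True and contains_chinese("parse project") is False;
# only characters in the CJK Unified Ideographs block (U+4E00..U+9FFF) count as Chinese here.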
def split_list(lst, n_each_req):
    """
    Split a list into smaller lists, each with a maximum number of elements.
    :param lst: the list to split
    :param n_each_req: the maximum number of elements in each sub-list
    :return: a list of sub-lists
    """
    result = []
    for i in range(0, len(lst), n_each_req):
        result.append(lst[i:i + n_each_req])
    return result
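# For example: split_list([1, 2, 3, 4, 5], 2) returns [[1, 2], [3, 4], [5]].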
def map_to_json(map, language):
    dict_ = read_map_from_json(language)
    dict_.update(map)
    with open(f'docs/translate_{language.lower()}.json', 'w', encoding='utf8') as f:
        json.dump(dict_, f, indent=4, ensure_ascii=False)


def read_map_from_json(language):
    if os.path.exists(f'docs/translate_{language.lower()}.json'):
        with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f:
            res = json.load(f)
            # drop failed translations (None) and keys that contain no Chinese characters
            res = {k: v for k, v in res.items() if v is not None and contains_chinese(k)}
            return res
    return {}
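# Hedged round-trip sketch (the entry shown is hypothetical; the file name follows LANG,
# e.g. docs/translate_english.json):
#
#   map_to_json({"原始文本": "TranslatedText"}, language="English")
#   read_map_from_json(language="English")   # -> {"原始文本": "TranslatedText"}
#
# Because None values are dropped when the map is read, a later run will retry failed translations.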
def advanced_split(splitted_string, spliter, include_spliter=False):
    """
    Split every string in `splitted_string` on `spliter`; fragments produced by the split are kept
    only if they still contain Chinese characters, while strings that do not contain `spliter`
    are passed through unchanged. Returns the flattened list.
    """
    splitted_string_tmp = []
    for string_ in splitted_string:
        if spliter in string_:
            splitted = string_.split(spliter)
            for i, s in enumerate(splitted):
                if include_spliter:
                    if i != len(splitted) - 1:
                        splitted[i] += spliter
                splitted[i] = splitted[i].strip()
            for i in reversed(range(len(splitted))):
                if not contains_chinese(splitted[i]):
                    splitted.pop(i)
            splitted_string_tmp.extend(splitted)
        else:
            splitted_string_tmp.append(string_)
    return splitted_string_tmp
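# For example (illustrative): advanced_split(["程序启动,请稍候"], spliter=",") returns
# ["程序启动", "请稍候"]; a fragment produced by the split is discarded if it contains no Chinese.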
cached_translation = {}
cached_translation = read_map_from_json(language=LANG)
def trans(word_to_translate, language, special=False):
    if len(word_to_translate) == 0:
        return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies
    proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
        get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
    llm_kwargs = {
        'api_key': API_KEY,
        'llm_model': LLM_MODEL,
        'top_p': 1.0,
        'max_length': None,
        'temperature': 0.4,
    }
    import random
    N_EACH_REQ = random.randint(16, 32)
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    inputs_array = [str(s) for s in word_to_translate_split]
    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    if special:  # translate identifiers to English using the CamelCase naming convention
        sys_prompt_array = ["Translate the following names to English with the CamelCase naming convention. Keep the original format." for _ in inputs_array]
    else:
        sys_prompt_array = [f"Translate the following sentences to {LANG}. E.g., you should answer in the following format: ['translation of sentence 1', 'translation of sentence 2']. Do NOT answer in Chinese!" for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    for i, r in enumerate(result):
        if i % 2 == 1:  # odd entries hold GPT's replies; the preceding even entry holds the request
            try:
                res_before_trans = eval(result[i - 1])
                res_after_trans = eval(result[i])
                if len(res_before_trans) != len(res_after_trans):
                    raise RuntimeError
                for a, b in zip(res_before_trans, res_after_trans):
                    translated_result[a] = b
            except:
                # try:
                #     res_before_trans = word_to_translate_split[(i-1)//2]
                #     res_after_trans = [s for s in result[i].split("', '")]
                #     for a, b in zip(res_before_trans, res_after_trans):
                #         translated_result[a] = b
                # except:
                print('GPT answered in an unexpected format; some words may not be translated, but you can run again later to increase translation coverage.')
                res_before_trans = eval(result[i - 1])
                for a in res_before_trans:
                    translated_result[a] = None
    return translated_result
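# Hedged sketch of one request/reply pair handled by `trans` (the strings are hypothetical):
#   request (str() of one chunk): "['解析项目', '请稍候']"
#   expected reply              : "['ParseProject', 'PleaseWait']"   (a list literal of equal length)
# Both literals are eval()'d and zipped into {'解析项目': 'ParseProject', '请稍候': 'PleaseWait'};
# if the reply cannot be parsed, or the lengths differ, those words are mapped to None and will be
# retried on a later run.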
def trans_json(word_to_translate, language, special=False):
    if len(word_to_translate) == 0:
        return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies
    proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
        get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
    llm_kwargs = {
        'api_key': API_KEY,
        'llm_model': LLM_MODEL,
        'top_p': 1.0,
        'max_length': None,
        'temperature': 0.1,
    }
    import random
    N_EACH_REQ = random.randint(16, 32)
    random.shuffle(word_to_translate)
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    # each request is a JSON object whose values are "#", which GPT is asked to replace with translations
    inputs_array = [{k: "#" for k in s} for s in word_to_translate_split]
    inputs_array = [json.dumps(i, ensure_ascii=False) for i in inputs_array]
    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    sys_prompt_array = [TransPrompt for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    for i, r in enumerate(result):
        if i % 2 == 1:  # odd entries hold GPT's replies
            try:
                translated_result.update(json.loads(result[i]))
            except:
                # the reply was not valid JSON; print it for inspection and move on
                print(result[i])
                print(result)
    return translated_result
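# Hedged sketch of one request/reply pair handled by `trans_json` (the strings are hypothetical):
#   request : {"正在加载模型": "#", "翻译完成": "#"}
#   expected: {"正在加载模型": "Loading the model", "翻译完成": "Translation finished"}
# Each reply is parsed with json.loads and merged into the overall mapping; chunks that do not come
# back as valid JSON are printed for inspection and skipped.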
def step_1_core_key_translate():
    def extract_chinese_characters(file_path):
        syntax = []
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        import ast
        root = ast.parse(content)
        for node in ast.walk(root):
            if isinstance(node, ast.Name):
                if contains_chinese(node.id): syntax.append(node.id)
            if isinstance(node, ast.Import):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
            elif isinstance(node, ast.ImportFrom):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
                if node.module is not None:  # guard against relative imports such as `from . import x`
                    for k in node.module.split('.'):
                        if contains_chinese(k): syntax.append(k)
        return syntax

    def extract_chinese_characters_from_directory(directory_path):
        chinese_characters = []
        for root, dirs, files in os.walk(directory_path):
            if any([b in root for b in blacklist]):
                continue
            for file in files:
                if file.endswith('.py'):
                    file_path = os.path.join(root, file)
                    chinese_characters.extend(extract_chinese_characters(file_path))
        return chinese_characters

    directory_path = './'
    chinese_core_names = extract_chinese_characters_from_directory(directory_path)
    chinese_core_keys = [name for name in chinese_core_names]
    chinese_core_keys_norepeat = []
    for d in chinese_core_keys:
        if d not in chinese_core_keys_norepeat: chinese_core_keys_norepeat.append(d)
    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_core_keys_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)

    need_translate_mapping = trans(need_translate, language=LANG, special=True)
    map_to_json(need_translate_mapping, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))

    chinese_core_keys_norepeat_mapping = {}
    for k in chinese_core_keys_norepeat:
        chinese_core_keys_norepeat_mapping.update({k: cached_translation[k]})
    # replace longer keys first so that shorter keys cannot clobber substrings of longer ones
    chinese_core_keys_norepeat_mapping = dict(sorted(chinese_core_keys_norepeat_mapping.items(), key=lambda x: -len(x[0])))
    # ===============================================
    # copy
    # ===============================================
    def copy_source_code():
        from toolbox import get_conf
        import shutil
        import os
        try: shutil.rmtree(f'./multi-language/{LANG}/')
        except: pass
        os.makedirs(f'./multi-language', exist_ok=True)
        backup_dir = f'./multi-language/{LANG}/'
        shutil.copytree('./', backup_dir, ignore=lambda x, y: blacklist)
    copy_source_code()

    # ===============================================
    # primary key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                for k, v in chinese_core_keys_norepeat_mapping.items():
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
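# Hedged example of the step-1 identifier replacement (names are hypothetical): a function defined
# as `def 解析整个项目(...)` in the source tree would, after this step, appear as
# `def ParseEntireProject(...)` in every file under ./multi-language/English/, because the mapping
# produced by trans(..., special=True) asks for CamelCase English names.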
def step_2_core_key_translate():
    # =================================================================================================
    # step 2: translate string literals and comments
    # =================================================================================================
    def load_string(strings, string_input):
        string_ = string_input.strip().strip(',').strip().strip('.').strip()
        if string_.startswith('[Local Message]'):
            string_ = string_.replace('[Local Message]', '')
            string_ = string_.strip().strip(',').strip().strip('.').strip()
        splitted_string = [string_]
        # --------------------------------------
        splitted_string = advanced_split(splitted_string, spliter="，", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="。", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="）", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="（", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="(", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=")", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="<", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=">", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="[", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="]", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="【", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="】", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="？", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="：", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="#", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="\n", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=";", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="`", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=" ", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="- ", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="---", include_spliter=False)
        # --------------------------------------
        for j, s in enumerate(splitted_string):
            # skip fragments that contain '.com' or quote characters
            if '.com' in s: continue
            if '\'' in s: continue
            if '\"' in s: continue
            strings.append([s, 0])
    def get_strings(node):
        strings = []
        # recursively traverse the AST and collect Chinese string literals
        for child in ast.iter_child_nodes(node):
            if isinstance(child, ast.Str):
                if contains_chinese(child.s):
                    load_string(strings=strings, string_input=child.s)
            elif isinstance(child, ast.AST):
                strings.extend(get_strings(child))
        return strings
    string_literals = []
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # comments
                comments_arr = []
                for code_sp in content.splitlines():
                    comments = re.findall(r'#.*$', code_sp)
                    for comment in comments:
                        load_string(strings=comments_arr, string_input=comment)
                string_literals.extend(comments_arr)

                # strings
                import ast
                tree = ast.parse(content)
                res = get_strings(tree)
                string_literals.extend(res)

    [print(s) for s in string_literals]
    chinese_literal_names = []
    for string, offset in string_literals:
        chinese_literal_names.append(string)
    chinese_literal_names_norepeat = []
    for d in chinese_literal_names:
        if d not in chinese_literal_names_norepeat: chinese_literal_names_norepeat.append(d)
    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_literal_names_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)

    up = trans_json(need_translate, language=LANG, special=False)
    map_to_json(up, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
    # ===============================================
    # literal key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                for k, v in cached_translation.items():
                    if v is None: continue
                    # replace quote characters in the translation with backticks so the
                    # substituted text cannot terminate the surrounding string literal
                    if '"' in v:
                        v = v.replace('"', "`")
                    if '\'' in v:
                        v = v.replace('\'', "`")
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                # rename the file itself if its stem has a translation
                # (os.path.splitext is used instead of str.strip('.py'), which would also strip
                # leading/trailing 'p', 'y' and '.' characters from the name)
                file_stem = os.path.splitext(file)[0]
                if file_stem in cached_translation:
                    file_new = cached_translation[file_stem] + '.py'
                    file_path_new = os.path.join(root, file_new)
                    with open(file_path_new, 'w', encoding='utf-8') as f:
                        f.write(content)
                    os.remove(file_path)
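# Hedged example of the step-2 literal replacement (strings are hypothetical): a translation that
# itself contains quote characters, e.g. Please enter "keyword", is written back as
# Please enter `keyword`, so the substituted text cannot terminate the surrounding Python string.
# A file such as 联网搜索.py whose stem has a translation would also be renamed, e.g. to
# OnlineSearch.py.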
step_1_core_key_translate()
step_2_core_key_translate()