Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Json is good
Browse files
- crazy_functions/批量总结PDF文档.py +2 -2
- docs/translate_english.json +0 -0
- multi_language.py +59 -2
    	
        crazy_functions/批量总结PDF文档.py
    CHANGED
    
    | @@ -41,8 +41,8 @@ def clean_text(raw_text): | |
| 41 | 
             
                """
         | 
| 42 | 
             
                对从 PDF 提取出的原始文本进行清洗和格式化处理。
         | 
| 43 | 
             
                1. 对原始文本进行归一化处理。
         | 
| 44 | 
            -
                2.  | 
| 45 | 
            -
                3. 根据 heuristic  | 
| 46 | 
             
                """
         | 
| 47 | 
             
                # 对文本进行归一化处理
         | 
| 48 | 
             
                normalized_text = normalize_text(raw_text)
         | 
|  | |
| 41 | 
             
                """
         | 
| 42 | 
             
                对从 PDF 提取出的原始文本进行清洗和格式化处理。
         | 
| 43 | 
             
                1. 对原始文本进行归一化处理。
         | 
| 44 | 
            +
                2. 替换跨行的连词
         | 
| 45 | 
            +
                3. 根据 heuristic 规则判断换行符是否是段落分隔,并相应地进行替换
         | 
| 46 | 
             
                """
         | 
| 47 | 
             
                # 对文本进行归一化处理
         | 
| 48 | 
             
                normalized_text = normalize_text(raw_text)
         | 
    	
        docs/translate_english.json
    CHANGED
    
    | The diff for this file is too large to render. See raw diff | 
|  | 
    	
        multi_language.py
    CHANGED
    
    | @@ -110,7 +110,7 @@ def read_map_from_json(language): | |
| 110 | 
             
                if os.path.exists(f'docs/translate_{language.lower()}.json'):
         | 
| 111 | 
             
                    with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f: 
         | 
| 112 | 
             
                        res = json.load(f)
         | 
| 113 | 
            -
                        res = {k:v for k, v in res.items() if v is not None}
         | 
| 114 | 
             
                        return res
         | 
| 115 | 
             
                return {}
         | 
| 116 |  | 
| @@ -181,6 +181,8 @@ def trans(word_to_translate, language, special=False): | |
| 181 | 
             
                        try:
         | 
| 182 | 
             
                            res_before_trans = eval(result[i-1])
         | 
| 183 | 
             
                            res_after_trans = eval(result[i])
         | 
|  | |
|  | |
| 184 | 
             
                            for a,b in zip(res_before_trans, res_after_trans):
         | 
| 185 | 
             
                                translated_result[a] = b
         | 
| 186 | 
             
                        except:
         | 
| @@ -196,6 +198,57 @@ def trans(word_to_translate, language, special=False): | |
| 196 | 
             
                                translated_result[a] = None
         | 
| 197 | 
             
                return translated_result
         | 
| 198 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 199 | 
             
            def step_1_core_key_translate():
         | 
| 200 | 
             
                def extract_chinese_characters(file_path):
         | 
| 201 | 
             
                    syntax = []
         | 
| @@ -310,6 +363,7 @@ def step_2_core_key_translate(): | |
| 310 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="]", include_spliter=False)
         | 
| 311 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="【", include_spliter=False)
         | 
| 312 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="】", include_spliter=False)
         | 
|  | |
| 313 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
         | 
| 314 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
         | 
| 315 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
         | 
| @@ -318,6 +372,9 @@ def step_2_core_key_translate(): | |
| 318 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=";", include_spliter=False)
         | 
| 319 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="`", include_spliter=False)
         | 
| 320 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="   ", include_spliter=False)
         | 
|  | |
|  | |
|  | |
| 321 | 
             
                    # --------------------------------------
         | 
| 322 | 
             
                    for j, s in enumerate(splitted_string): # .com
         | 
| 323 | 
             
                        if '.com' in s: continue
         | 
| @@ -377,7 +434,7 @@ def step_2_core_key_translate(): | |
| 377 | 
             
                        need_translate.append(d)
         | 
| 378 |  | 
| 379 |  | 
| 380 | 
            -
                up =  | 
| 381 | 
             
                map_to_json(up, language=LANG)
         | 
| 382 | 
             
                cached_translation = read_map_from_json(language=LANG)
         | 
| 383 | 
             
                cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
         | 
|  | |
| 110 | 
             
                if os.path.exists(f'docs/translate_{language.lower()}.json'):
         | 
| 111 | 
             
                    with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f: 
         | 
| 112 | 
             
                        res = json.load(f)
         | 
| 113 | 
            +
                        res = {k:v for k, v in res.items() if v is not None and contains_chinese(k)}
         | 
| 114 | 
             
                        return res
         | 
| 115 | 
             
                return {}
         | 
| 116 |  | 
|  | |
| 181 | 
             
                        try:
         | 
| 182 | 
             
                            res_before_trans = eval(result[i-1])
         | 
| 183 | 
             
                            res_after_trans = eval(result[i])
         | 
| 184 | 
            +
                            if len(res_before_trans) != len(res_after_trans): 
         | 
| 185 | 
            +
                                raise RuntimeError
         | 
| 186 | 
             
                            for a,b in zip(res_before_trans, res_after_trans):
         | 
| 187 | 
             
                                translated_result[a] = b
         | 
| 188 | 
             
                        except:
         | 
|  | |
| 198 | 
             
                                translated_result[a] = None
         | 
| 199 | 
             
                return translated_result
         | 
| 200 |  | 
| 201 | 
            +
             | 
| 202 | 
            +
            def trans_json(word_to_translate, language, special=False):
         | 
| 203 | 
            +
                if len(word_to_translate) == 0: return {}
         | 
| 204 | 
            +
                from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
         | 
| 205 | 
            +
                from toolbox import get_conf, ChatBotWithCookies
         | 
| 206 | 
            +
                proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
         | 
| 207 | 
            +
                    get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
         | 
| 208 | 
            +
                llm_kwargs = {
         | 
| 209 | 
            +
                    'api_key': API_KEY,
         | 
| 210 | 
            +
                    'llm_model': LLM_MODEL,
         | 
| 211 | 
            +
                    'top_p':1.0, 
         | 
| 212 | 
            +
                    'max_length': None,
         | 
| 213 | 
            +
                    'temperature':0.1,
         | 
| 214 | 
            +
                }
         | 
| 215 | 
            +
                import random
         | 
| 216 | 
            +
                N_EACH_REQ = random.randint(16, 32)
         | 
| 217 | 
            +
                random.shuffle(word_to_translate)
         | 
| 218 | 
            +
                word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
         | 
| 219 | 
            +
                inputs_array = [{k:"#" for k in s} for s in word_to_translate_split]
         | 
| 220 | 
            +
                inputs_array = [ json.dumps(i, ensure_ascii=False)  for i in inputs_array]
         | 
| 221 | 
            +
                
         | 
| 222 | 
            +
                inputs_show_user_array = inputs_array
         | 
| 223 | 
            +
                history_array = [[] for _ in inputs_array]
         | 
| 224 | 
            +
                sys_prompt_array = [f"Replace each json value `#` with translated results in {LANG}, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #." for _ in inputs_array]
         | 
| 225 | 
            +
                chatbot = ChatBotWithCookies(llm_kwargs)
         | 
| 226 | 
            +
                gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
         | 
| 227 | 
            +
                    inputs_array, 
         | 
| 228 | 
            +
                    inputs_show_user_array, 
         | 
| 229 | 
            +
                    llm_kwargs, 
         | 
| 230 | 
            +
                    chatbot, 
         | 
| 231 | 
            +
                    history_array, 
         | 
| 232 | 
            +
                    sys_prompt_array, 
         | 
| 233 | 
            +
                )
         | 
| 234 | 
            +
                while True:
         | 
| 235 | 
            +
                    try:
         | 
| 236 | 
            +
                        gpt_say = next(gpt_say_generator)
         | 
| 237 | 
            +
                        print(gpt_say[1][0][1])
         | 
| 238 | 
            +
                    except StopIteration as e:
         | 
| 239 | 
            +
                        result = e.value
         | 
| 240 | 
            +
                        break
         | 
| 241 | 
            +
                translated_result = {}
         | 
| 242 | 
            +
                for i, r in enumerate(result):
         | 
| 243 | 
            +
                    if i%2 == 1:
         | 
| 244 | 
            +
                        try:
         | 
| 245 | 
            +
                            translated_result.update(json.loads(result[i]))
         | 
| 246 | 
            +
                        except:
         | 
| 247 | 
            +
                            print(result[i])
         | 
| 248 | 
            +
                print(result)
         | 
| 249 | 
            +
                return translated_result
         | 
| 250 | 
            +
             | 
| 251 | 
            +
             | 
| 252 | 
             
            def step_1_core_key_translate():
         | 
| 253 | 
             
                def extract_chinese_characters(file_path):
         | 
| 254 | 
             
                    syntax = []
         | 
|  | |
| 363 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="]", include_spliter=False)
         | 
| 364 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="【", include_spliter=False)
         | 
| 365 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="】", include_spliter=False)
         | 
| 366 | 
            +
                    splitted_string = advanced_split(splitted_string, spliter="?", include_spliter=False)
         | 
| 367 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
         | 
| 368 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
         | 
| 369 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
         | 
|  | |
| 372 | 
             
                    splitted_string = advanced_split(splitted_string, spliter=";", include_spliter=False)
         | 
| 373 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="`", include_spliter=False)
         | 
| 374 | 
             
                    splitted_string = advanced_split(splitted_string, spliter="   ", include_spliter=False)
         | 
| 375 | 
            +
                    splitted_string = advanced_split(splitted_string, spliter="- ", include_spliter=False)
         | 
| 376 | 
            +
                    splitted_string = advanced_split(splitted_string, spliter="---", include_spliter=False)
         | 
| 377 | 
            +
                    
         | 
| 378 | 
             
                    # --------------------------------------
         | 
| 379 | 
             
                    for j, s in enumerate(splitted_string): # .com
         | 
| 380 | 
             
                        if '.com' in s: continue
         | 
|  | |
| 434 | 
             
                        need_translate.append(d)
         | 
| 435 |  | 
| 436 |  | 
| 437 | 
            +
                up = trans_json(need_translate, language=LANG, special=False)
         | 
| 438 | 
             
                map_to_json(up, language=LANG)
         | 
| 439 | 
             
                cached_translation = read_map_from_json(language=LANG)
         | 
| 440 | 
             
                cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
         |