Spaces:
Paused
Paused
修复完成后的文件显示问题
Browse files- crazy_functions/crazy_utils.py +73 -0
- crazy_functions/批量翻译PDF文档_多线程.py +33 -124
crazy_functions/crazy_utils.py
CHANGED
|
@@ -1,6 +1,79 @@
|
|
| 1 |
|
| 2 |
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
| 5 |
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
| 6 |
if get_token_fn(txt_tocut) <= limit:
|
|
|
|
| 1 |
|
| 2 |
|
| 3 |
|
| 4 |
+
def request_gpt_model_in_new_thread_with_ui_alive(inputs, inputs_show_user, top_p, temperature, chatbot, history, sys_prompt, refresh_interval=0.2):
    """Run one GPT request on a worker thread while keeping the UI refreshed.

    This is a generator. It appends a ``[inputs_show_user, ""]`` row to
    ``chatbot``, submits ``predict_no_ui_long_connection`` to a thread pool,
    and then, every ``refresh_interval`` seconds, copies the text found in
    the shared observation window into that row and yields so the front end
    can redraw.

    Args:
        inputs: Prompt forwarded to ``predict_no_ui_long_connection``.
        inputs_show_user: Text shown in the chatbot row for this request.
        top_p: Forwarded unchanged to the request function.
        temperature: Forwarded unchanged to the request function.
        chatbot: Mutable list of ``[question, answer]`` rows; modified in place.
        history: Forwarded unchanged to the request function.
        sys_prompt: Forwarded unchanged to the request function.
        refresh_interval: Seconds to sleep between UI refreshes.

    Yields:
        ``(chatbot, [], msg)`` tuples, where ``msg`` is a status string.

    Returns:
        The value returned by ``predict_no_ui_long_connection`` (delivered as
        the generator's return value, i.e. usable through ``yield from``).
        If the worker raised, ``future.result()`` re-raises that exception.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection

    # User feedback: show the request immediately with an empty answer.
    chatbot.append([inputs_show_user, ""])
    msg = '正常'
    yield chatbot, [], msg

    # Shared observation window: [text so far, watchdog timestamp].
    # Presumably the worker writes partial output into slot 0 and checks
    # slot 1 for staleness -- confirm in predict_no_ui_long_connection.
    mutable = ["", time.time()]

    # Exactly one task is submitted, so a single worker is sufficient.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(
            predict_no_ui_long_connection,
            inputs=inputs, top_p=top_p, temperature=temperature,
            history=history, sys_prompt=sys_prompt, observe_window=mutable,
        )
        while True:
            # Sleep, then yield once per cycle to refresh the front end.
            time.sleep(refresh_interval)
            # "Feed the watchdog": refresh the timestamp in the window.
            mutable[1] = time.time()
            if future.done():
                break
            chatbot[-1] = [chatbot[-1][0], mutable[0]]
            msg = "正常"
            yield chatbot, [], msg
        return future.result()
    finally:
        # Release the pool on every exit path (normal return, worker error,
        # or the consumer closing the generator). wait=False so an abandoned
        # generator does not block on a request that is still in flight.
        executor.shutdown(wait=False)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(inputs_array, inputs_show_user_array, top_p, temperature, chatbot, history_array, sys_prompt_array, refresh_interval=0.2, max_workers=10, scroller_max_len=30):
    """Run many GPT requests concurrently while streaming progress to the UI.

    This is a generator. It appends one status row to ``chatbot``, submits one
    ``predict_no_ui_long_connection`` call per input fragment to a thread
    pool, and every ``refresh_interval`` seconds rewrites that status row
    with a per-fragment "running / finished" summary before yielding.

    Args:
        inputs_array: Prompts, one per fragment.
        inputs_show_user_array: Display text paired with each result.
        top_p: Forwarded unchanged to every request.
        temperature: Forwarded unchanged to every request.
        chatbot: Mutable list of ``[question, answer]`` rows; modified in place.
        history_array: Per-fragment history; same length as ``inputs_array``.
        sys_prompt_array: Per-fragment system prompt; same length as
            ``inputs_array``.
        refresh_interval: Seconds to sleep between UI refreshes.
        max_workers: Size of the thread pool.
        scroller_max_len: Number of trailing characters of each fragment's
            current text shown in the status row.

    Yields:
        ``(chatbot, [], msg)`` tuples, where ``msg`` is a status string.

    Returns:
        A flat list ``[show_0, reply_0, show_1, reply_1, ...]`` (delivered as
        the generator's return value). ``Future.result()`` re-raises any
        exception raised by a worker.

    Raises:
        AssertionError: If ``history_array`` or ``sys_prompt_array`` differs
            in length from ``inputs_array``.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection
    assert len(inputs_array) == len(history_array)
    assert len(inputs_array) == len(sys_prompt_array)
    executor = ThreadPoolExecutor(max_workers=max_workers)
    futures = []
    try:
        n_frag = len(inputs_array)
        # User feedback: a dedicated row that the progress display overwrites.
        chatbot.append(["请开始多线程操作。", ""])
        msg = '正常'
        yield chatbot, [], msg

        # One observation window per fragment: [text so far, watchdog time].
        # Presumably each worker writes partial output into slot 0 and checks
        # slot 1 for staleness -- confirm in predict_no_ui_long_connection.
        mutable = [["", time.time()] for _ in range(n_frag)]

        def _req_gpt(index, inputs, history, sys_prompt):
            return predict_no_ui_long_connection(
                inputs=inputs, top_p=top_p, temperature=temperature,
                history=history, sys_prompt=sys_prompt,
                observe_window=mutable[index],
            )

        # Start all asynchronous tasks.
        futures = [
            executor.submit(_req_gpt, index, inputs, history, sys_prompt)
            for index, (inputs, history, sys_prompt)
            in enumerate(zip(inputs_array, history_array, sys_prompt_array))
        ]

        cnt = 0
        while True:
            # Sleep, then yield once per cycle to refresh the front end.
            time.sleep(refresh_interval)
            cnt += 1
            worker_done = [f.done() for f in futures]
            if all(worker_done):
                break
            # Every worker's watchdog timestamp must be refreshed.
            for window in mutable:
                window[1] = time.time()
            # Build a short scrolling preview of each fragment's latest text,
            # with characters that would disturb the rendering replaced.
            observe_win = []
            for window in mutable:
                tail = window[0][-scroller_max_len:]
                tail = tail.replace('\n', '').replace('```', '...').replace(' ', '.').replace('<br/>', '.....').replace('$', '.')
                observe_win.append("[ ...`" + tail + "`... ]")
            stat_str = ''.join(
                '已完成\n\n' if done else f'执行中: {obs}\n\n'
                for done, obs in zip(worker_done, observe_win)
            )
            # Trailing dots cycle from 1 to 10 as a simple activity indicator.
            chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + '.' * (cnt % 10 + 1)]
            msg = "正常"
            yield chatbot, [], msg
    finally:
        # Runs on every exit path, including the consumer closing the
        # generator early. cancel() only affects fragments that have not
        # started; it is a no-op for running or finished ones.
        for f in futures:
            f.cancel()
        executor.shutdown(wait=False)

    # All tasks finished: pair each display string with its reply.
    # NOTE(review): zip stops at the shorter sequence, so a short
    # inputs_show_user_array silently drops trailing results.
    gpt_response_collection = []
    for inputs_show_user, f in zip(inputs_show_user_array, futures):
        gpt_response_collection.extend([inputs_show_user, f.result()])
    return gpt_response_collection
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
|
| 77 |
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
| 78 |
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
| 79 |
if get_token_fn(txt_tocut) <= limit:
|
crazy_functions/批量翻译PDF文档_多线程.py
CHANGED
|
@@ -1,66 +1,25 @@
|
|
| 1 |
-
from toolbox import CatchException, report_execption, write_results_to_file
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
|
| 5 |
-
|
| 6 |
-
def is_paragraph_break(match):
    """Decide whether a matched line break should become a paragraph break.

    The break is treated as a paragraph separator only when all of these
    hold: the character before it ends a sentence (``.``, ``!`` or ``?``),
    the character after it is uppercase, and more than 140 characters of the
    searched string precede that sentence-ending character.

    Args:
        match: A regex match with exactly two groups: the character before
            the newline and the character after it.

    Returns:
        ``"\n\n"`` for a paragraph break, otherwise a single space ``" "``.
    """
    prev_char, next_char = match.groups()

    # Sentence-ending punctuation.
    sentence_endings = ".!?"

    # Minimum amount of preceding text before a break may count as a
    # paragraph boundary.
    min_paragraph_length = 140

    # match.start(1) is the number of characters before group 1; using it
    # directly avoids copying the whole prefix of the string on every match.
    if (prev_char in sentence_endings
            and next_char.isupper()
            and match.start(1) > min_paragraph_length):
        return "\n\n"
    return " "
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def normalize_text(text):
    """Reduce text to plain ASCII after Unicode compatibility decomposition.

    The input is first decomposed with NFKD, which splits ligatures and
    accented letters into their base characters (for example the single
    ligature character "fi" becomes the two letters "f" and "i"). Every run
    of characters outside the ASCII range that remains is then removed.

    Args:
        text: The string to normalize.

    Returns:
        The decomposed string with all non-ASCII characters deleted.
    """
    # Decompose ligatures and other compatibility characters.
    decomposed = unicodedata.normalize("NFKD", text)

    # Strip whatever is still outside the 7-bit ASCII range.
    return re.sub(r'[^\x00-\x7F]+', '', decomposed)
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def clean_text(raw_text):
    """Clean raw extracted text: normalize, de-hyphenate and re-flow lines.

    Steps, in order:
      1. Normalize the text with ``normalize_text``.
      2. Join words that were hyphenated across a line break
         (``"exam-\nple"`` becomes ``"example"``).
      3. Replace each newline that sits directly between two non-whitespace
         characters with either a space or a paragraph break, as decided by
         ``is_paragraph_break``.
      4. Strip leading and trailing whitespace.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Normalize the text first.
    normalized_text = normalize_text(raw_text)

    # Join words split across lines by a hyphen. Lookaround is used so the
    # neighbouring word characters are not consumed: with a consuming
    # pattern, consecutive breaks such as "a-\nb-\nc" would only have the
    # first one repaired, because matches cannot overlap.
    text = re.sub(r'(?<=\w)-\n(?=\w)', '', normalized_text)

    # Locate newlines by the characters on either side. The following
    # character is captured inside a lookahead so it stays available as the
    # "previous" character of the next newline (e.g. one-character lines).
    newlines = re.compile(r'(\S)\n(?=(\S))')

    # Replace each such newline with a space or a paragraph separator,
    # according to the heuristic. Group 2 is not re-emitted because the
    # lookahead did not consume it.
    final_text = newlines.sub(
        lambda m: m.group(1) + is_paragraph_break(m), text)

    return final_text.strip()
|
| 62 |
-
|
| 63 |
-
def read_and_clean_pdf_text(fp):
|
| 64 |
import fitz, re
|
| 65 |
import numpy as np
|
| 66 |
# file_content = ""
|
|
@@ -170,69 +129,7 @@ def 批量翻译PDF文档(txt, top_p, temperature, chatbot, history, sys_prompt,
|
|
| 170 |
yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt)
|
| 171 |
|
| 172 |
|
| 173 |
-
def request_gpt_model_in_new_thread_with_ui_alive(inputs, inputs_show_user, top_p, temperature, chatbot, history, sys_prompt, refresh_interval=0.2):
    """Run one GPT request on a worker thread while keeping the UI refreshed.

    This is a generator. It appends a ``[inputs_show_user, ""]`` row to
    ``chatbot``, submits ``predict_no_ui_long_connection`` to a thread pool,
    and then, every ``refresh_interval`` seconds, copies the text found in
    the shared observation window into that row and yields so the front end
    can redraw.

    Args:
        inputs: Prompt forwarded to ``predict_no_ui_long_connection``.
        inputs_show_user: Text shown in the chatbot row for this request.
        top_p: Forwarded unchanged to the request function.
        temperature: Forwarded unchanged to the request function.
        chatbot: Mutable list of ``[question, answer]`` rows; modified in place.
        history: Forwarded unchanged to the request function.
        sys_prompt: Forwarded unchanged to the request function.
        refresh_interval: Seconds to sleep between UI refreshes.

    Yields:
        ``(chatbot, [], msg)`` tuples, where ``msg`` is a status string.

    Returns:
        The value returned by ``predict_no_ui_long_connection`` (delivered as
        the generator's return value, i.e. usable through ``yield from``).
        If the worker raised, ``future.result()`` re-raises that exception.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection

    # User feedback: show the request immediately with an empty answer.
    chatbot.append([inputs_show_user, ""])
    msg = '正常'
    yield chatbot, [], msg

    # Shared observation window: [text so far, watchdog timestamp].
    # Presumably the worker writes partial output into slot 0 and checks
    # slot 1 for staleness -- confirm in predict_no_ui_long_connection.
    mutable = ["", time.time()]

    # Exactly one task is submitted, so a single worker is sufficient.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(
            predict_no_ui_long_connection,
            inputs=inputs, top_p=top_p, temperature=temperature,
            history=history, sys_prompt=sys_prompt, observe_window=mutable,
        )
        while True:
            # Sleep, then yield once per cycle to refresh the front end.
            time.sleep(refresh_interval)
            # "Feed the watchdog": refresh the timestamp in the window.
            mutable[1] = time.time()
            if future.done():
                break
            chatbot[-1] = [chatbot[-1][0], mutable[0]]
            msg = "正常"
            yield chatbot, [], msg
        return future.result()
    finally:
        # Release the pool on every exit path (normal return, worker error,
        # or the consumer closing the generator). wait=False so an abandoned
        # generator does not block on a request that is still in flight.
        executor.shutdown(wait=False)
|
| 194 |
|
| 195 |
-
def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(inputs_array, inputs_show_user_array, top_p, temperature, chatbot, history_array, sys_prompt_array, refresh_interval=0.2, max_workers=10, scroller_max_len=30):
    """Run many GPT requests concurrently while streaming progress to the UI.

    This is a generator. It submits one ``predict_no_ui_long_connection``
    call per input fragment to a thread pool and, every ``refresh_interval``
    seconds, rewrites the answer of the LAST existing ``chatbot`` row with a
    per-fragment "running / finished" summary before yielding. It does not
    append a row of its own, so ``chatbot`` must already be non-empty.

    Args:
        inputs_array: Prompts, one per fragment.
        inputs_show_user_array: Display text paired with each result.
        top_p: Forwarded unchanged to every request.
        temperature: Forwarded unchanged to every request.
        chatbot: Mutable list of ``[question, answer]`` rows; modified in place.
        history_array: Per-fragment history; same length as ``inputs_array``.
        sys_prompt_array: Per-fragment system prompt; same length as
            ``inputs_array``.
        refresh_interval: Seconds to sleep between UI refreshes.
        max_workers: Size of the thread pool.
        scroller_max_len: Number of trailing characters of each fragment's
            current text shown in the status row.

    Yields:
        ``(chatbot, [], msg)`` tuples, where ``msg`` is a status string.

    Returns:
        A flat list ``[show_0, reply_0, show_1, reply_1, ...]`` (delivered as
        the generator's return value). ``Future.result()`` re-raises any
        exception raised by a worker.

    Raises:
        AssertionError: If ``history_array`` or ``sys_prompt_array`` differs
            in length from ``inputs_array``.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection
    assert len(inputs_array) == len(history_array)
    assert len(inputs_array) == len(sys_prompt_array)
    executor = ThreadPoolExecutor(max_workers=max_workers)
    futures = []
    try:
        n_frag = len(inputs_array)

        # One observation window per fragment: [text so far, watchdog time].
        # Presumably each worker writes partial output into slot 0 and checks
        # slot 1 for staleness -- confirm in predict_no_ui_long_connection.
        mutable = [["", time.time()] for _ in range(n_frag)]

        def _req_gpt(index, inputs, history, sys_prompt):
            return predict_no_ui_long_connection(
                inputs=inputs, top_p=top_p, temperature=temperature,
                history=history, sys_prompt=sys_prompt,
                observe_window=mutable[index],
            )

        # Start all asynchronous tasks.
        futures = [
            executor.submit(_req_gpt, index, inputs, history, sys_prompt)
            for index, (inputs, history, sys_prompt)
            in enumerate(zip(inputs_array, history_array, sys_prompt_array))
        ]

        cnt = 0
        while True:
            # Sleep, then yield once per cycle to refresh the front end.
            time.sleep(refresh_interval)
            cnt += 1
            worker_done = [f.done() for f in futures]
            if all(worker_done):
                break
            # Every worker's watchdog timestamp must be refreshed.
            for window in mutable:
                window[1] = time.time()
            # Build a short scrolling preview of each fragment's latest text,
            # with characters that would disturb the rendering replaced.
            observe_win = []
            for window in mutable:
                tail = window[0][-scroller_max_len:]
                tail = tail.replace('\n', '').replace('```', '...').replace(' ', '.').replace('<br/>', '.....').replace('$', '.')
                observe_win.append("[ ...`" + tail + "`... ]")
            stat_str = ''.join(
                '已完成\n\n' if done else f'执行中: {obs}\n\n'
                for done, obs in zip(worker_done, observe_win)
            )
            # Trailing dots cycle from 1 to 10 as a simple activity indicator.
            chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + '.' * (cnt % 10 + 1)]
            msg = "正常"
            yield chatbot, [], msg
    finally:
        # Runs on every exit path, including the consumer closing the
        # generator early. cancel() only affects fragments that have not
        # started; it is a no-op for running or finished ones.
        for f in futures:
            f.cancel()
        executor.shutdown(wait=False)

    # All tasks finished: pair each display string with its reply.
    # NOTE(review): zip stops at the shorter sequence, so a short
    # inputs_show_user_array silently drops trailing results.
    gpt_response_collection = []
    for inputs_show_user, f in zip(inputs_show_user_array, futures):
        gpt_response_collection.extend([inputs_show_user, f.result()])
    return gpt_response_collection
|
| 236 |
|
| 237 |
def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt):
|
| 238 |
import time
|
|
@@ -241,7 +138,7 @@ def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, histor
|
|
| 241 |
import fitz
|
| 242 |
import tiktoken
|
| 243 |
TOKEN_LIMIT_PER_FRAGMENT = 1600
|
| 244 |
-
|
| 245 |
for index, fp in enumerate(file_manifest):
|
| 246 |
# 读取PDF文件
|
| 247 |
file_content, page_one = read_and_clean_pdf_text(fp)
|
|
@@ -277,7 +174,19 @@ def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, histor
|
|
| 277 |
|
| 278 |
final = ["", paper_meta_info + '\n\n---\n\n---\n\n---\n\n']
|
| 279 |
final.extend(gpt_response_collection)
|
| 280 |
-
|
|
|
|
|
|
|
| 281 |
chatbot.append((f"{fp}完成了吗?", res)); msg = "完成"
|
| 282 |
yield chatbot, history, msg
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from toolbox import CatchException, report_execption, write_results_to_file
|
| 2 |
+
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
|
| 3 |
+
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
|
| 4 |
|
| 5 |
+
def read_and_clean_pdf_text(fp):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
+
**输入参数说明**
|
| 8 |
+
- `fp`:需要读取和清理文本的pdf文件路径
|
| 9 |
+
|
| 10 |
+
**输出参数说明**
|
| 11 |
+
- `meta_txt`:清理后的文本内容字符串
|
| 12 |
+
- `page_one_meta`:第一页清理后的文本内容列表
|
| 13 |
+
|
| 14 |
+
**函数功能**
|
| 15 |
+
读取pdf文件并清理其中的文本内容,清理规则包括:
|
| 16 |
+
- 提取所有块元的文本信息,并合并为一个字符串
|
| 17 |
+
- 去除短块(字符数小于100)并替换为回车符
|
| 18 |
+
- 清理多余的空行
|
| 19 |
+
- 合并小写字母开头的段落块并替换为空格
|
| 20 |
+
- 清除重复的换行
|
| 21 |
+
- 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
|
| 22 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
import fitz, re
|
| 24 |
import numpy as np
|
| 25 |
# file_content = ""
|
|
|
|
| 129 |
yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt)
|
| 130 |
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt):
|
| 135 |
import time
|
|
|
|
| 138 |
import fitz
|
| 139 |
import tiktoken
|
| 140 |
TOKEN_LIMIT_PER_FRAGMENT = 1600
|
| 141 |
+
generated_conclusion_files = []
|
| 142 |
for index, fp in enumerate(file_manifest):
|
| 143 |
# 读取PDF文件
|
| 144 |
file_content, page_one = read_and_clean_pdf_text(fp)
|
|
|
|
| 174 |
|
| 175 |
final = ["", paper_meta_info + '\n\n---\n\n---\n\n---\n\n']
|
| 176 |
final.extend(gpt_response_collection)
|
| 177 |
+
create_report_file_name = f"{os.path.basename(fp)}.trans.md"
|
| 178 |
+
res = write_results_to_file(final, file_name=create_report_file_name)
|
| 179 |
+
generated_conclusion_files.append(f'./gpt_log/{create_report_file_name}')
|
| 180 |
chatbot.append((f"{fp}完成了吗?", res)); msg = "完成"
|
| 181 |
yield chatbot, history, msg
|
| 182 |
|
| 183 |
+
# 准备文件的下载
|
| 184 |
+
import shutil
|
| 185 |
+
for pdf_path in generated_conclusion_files:
|
| 186 |
+
# 重命名文件
|
| 187 |
+
rename_file = f'./gpt_log/总结论文-{os.path.basename(pdf_path)}'
|
| 188 |
+
if os.path.exists(rename_file): os.remove(rename_file)
|
| 189 |
+
shutil.copyfile(pdf_path, rename_file);
|
| 190 |
+
if os.path.exists(pdf_path): os.remove(pdf_path)
|
| 191 |
+
chatbot.append(("给出输出文件清单", str(generated_conclusion_files)))
|
| 192 |
+
yield chatbot, history, msg
|