update word
- db/mongodb.py +1 -1
- excel/excel_translate.py +1 -2
- pages/upload.py +3 -3
- test.ipynb +124 -46
- word/word_translate.py +138 -180
db/mongodb.py
CHANGED
@@ -13,7 +13,7 @@ def connect_mongodb(db_name, collection_name):
-def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file"
+def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file"):
     """
     Lưu file vào MongoDB bằng GridFS mà không kiểm tra trùng lặp.
     """
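For orientation, here is a minimal sketch of what a GridFS save helper like save_file_to_mongodb typically looks like. The connection URI, the use of uploaded_file.read() and uploaded_file.name, and the returned ObjectId are assumptions for illustration, not taken from this commit.

    import gridfs
    from pymongo import MongoClient

    def save_file_to_mongodb_sketch(uploaded_file, db_name="ppt", collection_name="root_file"):
        # Hypothetical sketch: store an uploaded file's bytes in GridFS and return the new ObjectId.
        client = MongoClient("mongodb://localhost:27017")  # placeholder URI, not the repo's
        db = client[db_name]
        fs = gridfs.GridFS(db, collection=collection_name)
        file_id = fs.put(uploaded_file.read(), filename=uploaded_file.name)
        client.close()
        return file_id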
excel/excel_translate.py
CHANGED
@@ -94,7 +94,7 @@ def read_csv_with_auto_encoding(csv_path):
     return df


-def translate_csv(file_id,
+def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
     db = client[db_name]
@@ -143,7 +143,6 @@ def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", ch
         # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
-            source_lang=source_lang,
             target_lang=target_lang,
             gemini_api=gemini_api
         )
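The new translate_csv signature drops source_lang and adds chunk_size and text_columns. A minimal sketch of the chunking pattern implied by those parameters follows; translate_text_dict is the repo's own helper, and its return format (a dict keyed by the same indices) is an assumption here, as is the helper name.

    from translate.translator import translate_text_dict

    def translate_values_in_chunks(values, target_lang="vi", gemini_api="", chunk_size=50):
        # Hypothetical sketch: split a column's values into chunks of chunk_size,
        # translate each chunk as an index -> text dictionary, and reassemble in order.
        translated = []
        for start in range(0, len(values), chunk_size):
            chunk = values[start:start + chunk_size]
            chunk_dict = {i: text for i, text in enumerate(chunk)}
            translated_chunk = translate_text_dict(
                text_dict=chunk_dict,
                target_lang=target_lang,
                gemini_api=gemini_api,
            )
            translated.extend(translated_chunk[i] for i in range(len(chunk)))
        return translated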
pages/upload.py
CHANGED
@@ -7,7 +7,7 @@ from powerpoint.xml_handling import (
 from translate.translator import translate_text_dict
 from powerpoint.pptx_object import create_translated_ppt
 from excel.excel_translate import translate_xlsx, translate_csv
-from word.word_translate import
+from word.word_translate import translate_docx_from_mongodb

 import dotenv
 import os
@@ -18,7 +18,7 @@ dotenv.load_dotenv(".env")
 # Cấu hình API key
 api_key = os.getenv("GEMINI_API_KEY")
 genai.configure(api_key=api_key)
-model = genai.GenerativeModel("gemini-
+model = genai.GenerativeModel("gemini-2.0-flash")

 # Giao diện Streamlit
 st.title("Please chose your PPTX, Excel file to translate")
@@ -116,7 +116,7 @@ if uploaded_file is not None:
     file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file")
     st.write(f"File ID: {file_id}")

-    final_id =
+    final_id = translate_docx_from_mongodb(file_id = file_id, target_lang="Vietnamese")
     st.write(f"Final CSV ID: {final_id}")
     if final_id:
         st.write("✅ File đã sẵn sàng để tải xuống!")
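The page now stores the uploaded .docx in the "word" database, calls translate_docx_from_mongodb, and reports final_id. A hedged sketch of how the translated file could then be offered for download from the final_file bucket; the helper name and MIME type are illustrative and not part of this commit.

    import gridfs
    import streamlit as st
    from bson import ObjectId
    from pymongo import MongoClient

    def offer_docx_download(final_id, mongo_uri):
        # Hypothetical sketch: read the translated document back out of GridFS
        # and hand its bytes to Streamlit's download button.
        client = MongoClient(mongo_uri)
        fs_output = gridfs.GridFS(client["word"], collection="final_file")
        grid_out = fs_output.get(ObjectId(final_id))
        st.download_button(
            label="Download translated file",
            data=grid_out.read(),
            file_name=grid_out.filename,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
        client.close()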
test.ipynb
CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -23,9 +23,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Kết nối thành công!\n"
+     ]
+    }
+   ],
    "source": [
     "from pymongo import MongoClient\n",
     "\n",
@@ -79,7 +87,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -92,7 +100,7 @@
     " :param collection_name: Tên collection GridFS\n",
     " \"\"\"\n",
     " # Kết nối đến MongoDB\n",
-    " client = MongoClient(\"mongodb://
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name] # Database của bạn\n",
     " fs = gridfs.GridFS(db, collection=collection_name) # Collection để lưu file\n",
     "\n",
@@ -124,7 +132,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -137,7 +145,7 @@
     " \"\"\"\n",
     " try:\n",
     " # Kết nối MongoDB\n",
-    " client = MongoClient(\"mongodb://
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name]\n",
     "\n",
     " # Khởi tạo GridFS với collection được chỉ định\n",
@@ -178,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -191,7 +199,7 @@
     " :param collection_name: Tên collection GridFS\n",
     " \"\"\"\n",
     " # Kết nối đến MongoDB\n",
-    " client = MongoClient(\"mongodb://
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name]\n",
     " fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
@@ -219,40 +227,56 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def
-    " \"\"\
-    " Tải file PowerPoint từ MongoDB GridFS và lưu về máy.\n",
-    " \n",
-    " :param file_id: ID của file cần tải (dạng chuỗi hoặc ObjectId)\n",
-    " :param save_path: Đường dẫn đến thư mục sẽ lưu file (VD: 'D:/output')\n",
-    " :param save_name: Tên file khi lưu (VD: 'my_presentation.pptx')\n",
-    " :param db_name: Tên database trong MongoDB (mặc định: 'ppt')\n",
-    " :param collection_name: Tên collection GridFS (mặc định: 'root_file')\n",
-    " \"\"\"\n",
-    " # Đảm bảo thư mục lưu file tồn tại\n",
-    " os.makedirs(save_path, exist_ok=True)\n",
+    "def download_input_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n",
+    " os.makedirs(\"D:\Show_me_everything\Machine Translation\input\", exist_ok=True)\n",
     "\n",
-    "
-    " full_file_path = os.path.join(save_path, save_name)\n",
+    " full_file_path = os.path.join(\"D:\Show_me_everything\Machine Translation\input\", save_name)\n",
     "\n",
-    "
-    "
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
+    " db = client[db_name]\n",
+    " fs = gridfs.GridFS(db, collection=collection_name)\n",
+    "\n",
+    " try:\n",
+    " if not isinstance(file_id, ObjectId):\n",
+    " file_id = ObjectId(file_id)\n",
+    "\n",
+    " file_data = fs.get(file_id)\n",
+    " \n",
+    " with open(full_file_path, \"wb\") as f:\n",
+    " f.write(file_data.read())\n",
+    "\n",
+    " print(f\"✅ File đã được tải về: {full_file_path}\")\n",
+    " except Exception as e:\n",
+    " print(f\"❌ Lỗi khi tải file: {e}\")\n",
+    " finally:\n",
+    " client.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_output_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n",
+    " os.makedirs(\"D:\Show_me_everything\Machine Translation\output\", exist_ok=True)\n",
+    "\n",
+    " full_file_path = os.path.join(\"D:\Show_me_everything\Machine Translation\output\", save_name)\n",
+    "\n",
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name]\n",
     " fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
     " try:\n",
-    " # Chuyển đổi ID nếu cần\n",
     " if not isinstance(file_id, ObjectId):\n",
     " file_id = ObjectId(file_id)\n",
     "\n",
-    " # Lấy dữ liệu file từ GridFS\n",
     " file_data = fs.get(file_id)\n",
     " \n",
-    " # Ghi dữ liệu ra file\n",
     " with open(full_file_path, \"wb\") as f:\n",
     " f.write(file_data.read())\n",
     "\n",
@@ -265,7 +289,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\chuong 8 NHTM.pptx\n"
+     ]
+    }
+   ],
+   "source": [
+    "download_input_from_mongodb(file_id=\"67dd7148972b1aa4dc9fb83d\", save_name=\"chuong 8 NHTM.pptx\", db_name=\"ppt\", collection_name=\"root_file\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\chuong 8 NHTM.pptx\n"
+     ]
+    }
+   ],
+   "source": [
+    "download_output_from_mongodb(file_id=\"67dd717f972b1aa4dc9fb84f\", save_name=\"chuong 8 NHTM.pptx\", db_name=\"ppt\", collection_name=\"final_pptx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -278,7 +336,7 @@
     " :param db_name: Tên database MongoDB\n",
     " :param collection_name: Tên collection GridFS\n",
     " \"\"\"\n",
-    " client = MongoClient(\"mongodb://
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name]\n",
     " fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
@@ -342,7 +400,7 @@
     " :return: ID của file XML trong MongoDB (original_xml)\n",
     " \"\"\"\n",
     " # Kết nối MongoDB\n",
-    " client = MongoClient(\"mongodb://
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name]\n",
     "\n",
     " fs_ppt = gridfs.GridFS(db, collection=\"original_pptx\") # PPT gốc\n",
@@ -391,7 +449,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -406,7 +464,7 @@
     " :return: Dictionary {slide_number: [text1, text2, ...]}\n",
     " \"\"\"\n",
     " # Kết nối MongoDB\n",
-    " client = MongoClient(\"mongodb://
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name]\n",
     " fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
@@ -472,7 +530,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -485,7 +543,7 @@
     " :param db_name: Tên database MongoDB\n",
     " \"\"\"\n",
     " # Kết nối MongoDB\n",
-    " client = MongoClient(\"mongodb://
+    " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     " db = client[db_name]\n",
     " \n",
     " fs_original = gridfs.GridFS(db, collection=\"original_xml\") # Lấy file từ original_xml\n",
@@ -644,10 +702,10 @@
    "name": "stdout",
    "output_type": "stream",
    "text": [
-    "✅ Đã xóa
+    "✅ Đã xóa 4 file trong collection 'root_file'\n",
     "✅ Đã xóa 1 file trong collection 'final_pptx'\n",
-    "✅ Đã xóa
-    "✅ Đã xóa
+    "✅ Đã xóa 1 file trong collection 'original_xml'\n",
+    "✅ Đã xóa 1 file trong collection 'final_xml'\n"
    ]
   }
  ],
@@ -656,6 +714,25 @@
    " delete_all_files_in_collection(i)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Đã xóa 5 file trong collection 'root_file'\n",
+      "✅ Đã xóa 2 file trong collection 'final_file'\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in ['root_file', 'final_file']:\n",
+    " delete_all_files_in_collection(i, db_name=\"excel\")"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 19,
@@ -675,13 +752,13 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def file_list(collection=\"root_file\"):\n",
-   " client = MongoClient(\"mongodb://
-   " db = client[\"
+   " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
+   " db = client[\"ppt\"]\n",
    " fs = gridfs.GridFS(db, collection=collection)\n",
    " for file in fs.find():\n",
    " print(f\"📂 File: {file.filename} - ID: {file._id}\")"
@@ -696,16 +773,17 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 8,
   "metadata": {},
   "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
-    "📂 File:
-    "📂 File:
-    "📂 File:
+    "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcd8c575cfef63155d3f91\n",
+    "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcea4f02257ad0cb04610e\n",
+    "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcead0143da29a5c6321ab\n",
+    "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dd3bf23cf7ee2f6eca902e\n"
    ]
   }
  ],
word/word_translate.py
CHANGED
@@ -1,60 +1,97 @@
-import os
 import docx
 from docx import Document
 import google.generativeai as genai
 import ast
 import json
-
-
+import re
+import dotenv
+import os
 import io
+
 from pymongo import MongoClient
 from gridfs import GridFS
 from docx import Document
-from deep_translator import GoogleTranslator

-
-
-
+dotenv.load_dotenv(".env")
+api_key = os.getenv("GEMINI_API_KEY")
+genai.configure(api_key=api_key)
+model = genai.GenerativeModel("gemini-2.0-flash")

-def batch_translate(texts,
+def batch_translate(texts, target_lang="Vietnamese"):
     """ Translates multiple text segments in a single API call. """
     if not texts:
         return texts # Skip if empty
-
-
-
-    {json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])}
-
+
+    system_prompt = """ You are given three inputs: source language, target language and a json file.
+    - Your task is to translate the JSON file from the source language (you have to detect the source language yourself) to the target language.
     - The original JSON file contains a Python array of objects, each with "index" and "text" keys.
     - Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
+    - The array contains text that makes up whole paragraphs. Make sure that the translation makes sense when the text is put together and retains the same context.
+    - This is very important: Empty spaces should be left as is. For example: From English, "Hello " should be translated into Vietnamese as "Xin chào ", with the same space at the end.
+    - Very frequently there are spaces before or after a string. Do not remove these spaces.
+    - If the source language is English and the target language is Vietnamese and a string contains "'s" in the possessive sense, translate it as "của".
+        - Example: [["WorldQuant's"], ["Mission"]] should be translated as [["Nhiệm vụ"], ["của WorldQuant"]]
+    - Words might be split into multiple continuous arrays. Translate them such that the translation corresponds to the full word.
+    - If a word is split up into multiple arrays, the translation should be such that the word is not split up.
+        - Exampe: ['Tesla sells its pro', 'ducts'] should be translated as ['Tesla bán sản phẩm của mình', ''.]. Note that the number of elements in the output is the same as the input.
+    - Example: [["Hello"], ["World"]] should be translated as [["Xin chào"], ["Thế giới"]]
     - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
-    - Return
-    -
+    - Return a JSON object that is a Python array.
+    - Each object in the array is a dictionary with two keys: "index" and "text".
+    - The text should be the translated version of the text in the original object, and the index should stay consistent.
+    - The number of objects in the output MUST the same as the number of objects in the input.
+    - The format of the output should look exactly like the example.
+    - Example:
+        **Input**: Target language: Vietnamese. JSON file:
+        [{"index": 0, "text": ["My name is "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 3, "text": ["Today is "]}, {"index": 4, "text": ["a"]}, {"index": 5, "text": ["good day"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]'
+        **Output**: [{"index": 0, "text": ["Tên tôi là "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 4, "text": ["Hôm nay là "]}, {"index": 3, "text": ["Một"]}, {"index": 5, "text": ["ngày đẹp"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]
+    - Return the result of translation according to the format. Do NOT return code for translating.
     """
-
-
-
-    model=
-
-
-
-
+    json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
+    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"
+
+    model = genai.GenerativeModel('gemini-2.0-flash')
+    response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
+        'temperature': 1, # Adjust temperature for desired creativity
+        'top_p': 1,
+        'top_k': 1,})
+    response_dict = ast.literal_eval(response.text.strip().strip("json```").strip("```").strip())
+    if len(response_dict) > 0:
+        if isinstance(response_dict[0]['text'], list):
+            translated_texts = [i['text'][0] for i in sorted(response_dict, key = lambda x: x['index'])]
+        elif isinstance(response_dict[0]['text'], str):
+            translated_texts = [i['text'] for i in sorted(response_dict, key = lambda x: x['index'])]
+    return translated_texts
+
+def full_translate(texts, target_lang="Vietnamese"):
+    full_translated_texts = []
+    batch = []
+    word_count = 0
+
+    for string in texts:
+        if len(string.split()) + word_count >= 1000:
+            print('Translating a batch.')
+            full_translated_texts += batch_translate(batch, target_lang)
+            batch = []
+            word_count = 0
+        batch.append(string)
+        word_count += len(string.split())
+
+    full_translated_texts += batch_translate(batch, target_lang)
+    return full_translated_texts

 def merge_runs(runs):
     """ Merges adjacent runs with the same style. """
     merged_runs = []
     for run in runs:
-        if merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run)
-        if (
-            merged_runs and
+        if (merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run) and
            run.style == merged_runs[-1].style and
            merged_runs[-1].bold == run.bold and
            merged_runs[-1].italic == run.italic and
            merged_runs[-1].underline == run.underline and
            merged_runs[-1].font.size == run.font.size and
            merged_runs[-1].font.color.rgb == run.font.color.rgb and
-           merged_runs[-1].font.name == run.font.name
-        ):
+           merged_runs[-1].font.name == run.font.name):
            merged_runs[-1].text += run.text
        else:
            merged_runs.append(run)
@@ -62,146 +99,7 @@ def merge_runs(runs):

 NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

-def
-    paragraphs = []
-    for para in doc.paragraphs:
-        for run in merge_runs(para.iter_inner_content()):
-            if isinstance(run, docx.text.run.Run):
-                paragraphs.append(run.text)
-    # paragraphs = merge_runs(paragraphs)
-    translated_paragraphs = []
-    temp_batch = []
-    words = 0
-    for para in paragraphs:
-        if len(para) + words > 5000:
-            translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
-            temp_batch = []
-            words = 0
-        words += len(para)
-        temp_batch.append(para)
-    translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
-    # translated_paragraphs = batch_translate(paragraphs, target_lang)
-
-    if len(translated_paragraphs) > 0:
-        # Replace translated text back
-        para_index = 0
-        for para in doc.paragraphs:
-            original_para = deepcopy(para)
-            para.clear() # Remove text while keeping paragraph properties
-            for run in merge_runs(original_para.iter_inner_content()):
-                if isinstance(run, docx.text.run.Run):
-                    translated_text = translated_paragraphs[para_index]
-                    try:
-                        translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8') # Ignore invalid characters
-                    except UnicodeEncodeError:
-                        translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8') # Replace invalid characters
-                    drawing = run._element.find(f".//{NS_W}drawing")
-                    pict = run._element.find(".//{NS_W}pict")
-
-                    # Create a new run with translated text and copy the formatting
-                    new_run = para.add_run(translated_text)
-                    new_run.style = run.style
-
-                    if drawing is not None:
-                        new_run._element.append(drawing)
-                    elif pict is not None:
-                        new_run._element.append(pict)
-
-                    # Copy formatting from original run
-                    new_run.bold = run.bold
-                    new_run.italic = run.italic
-                    new_run.underline = run.underline
-                    new_run.font.size = run.font.size
-                    new_run.font.color.rgb = run.font.color.rgb
-                    new_run.font.name = run.font.name
-                    para_index += 1
-                elif isinstance(run, docx.text.hyperlink.Hyperlink):
-                    parent = run._element
-                    tag = parent.tag.split("}")[-1]
-
-                    # Create a new hyperlink element with the correct namespace
-                    new_hyperlink = OxmlElement(f"w:{tag}")
-                    for attr in parent.attrib:
-                        new_hyperlink.set(attr, parent.get(attr))
-                    for child in parent:
-                        new_hyperlink.append(child)
-                    para._element.append(new_hyperlink)
-
-
-def translate_tables(doc, source_lang, target_lang):
-    table_texts = []
-    run_mapping = {}
-
-
-    for table in doc.tables:
-        for row in table.rows:
-            for cell in row.cells:
-                for para in cell.paragraphs:
-                    for run in merge_runs(para.iter_inner_content()):
-                        if isinstance(run, docx.text.run.Run):
-                            table_texts.append(run.text)
-
-    translated_tables = []
-    temp_batch = []
-    words = 0
-    for para in table_texts:
-        if len(para) + words > 5000:
-            translated_tables += batch_translate(temp_batch, source_lang, target_lang)
-            temp_batch = []
-            words = 0
-        words += len(para)
-        temp_batch.append(para)
-    translated_tables += batch_translate(temp_batch, source_lang, target_lang)
-    # translated_tables = batch_translate(table_texts, target_lang)
-
-    if len(translated_tables) > 0:
-        table_index = 0
-        for table in doc.tables:
-            for row in table.rows:
-                for cell in row.cells:
-                    for para in cell.paragraphs:
-                        original_para = deepcopy(para)
-                        para.clear() # Remove text while keeping paragraph properties
-                        for run in merge_runs(original_para.iter_inner_content()):
-                            if isinstance(run, docx.text.run.Run):
-                                translated_text = translated_tables[table_index]
-                                try:
-                                    translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8') # Ignore invalid characters
-                                except UnicodeEncodeError:
-                                    translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8') # Replace invalid characters
-                                drawing = run._element.find(f".//{NS_W}drawing")
-                                pict = run._element.find(".//{NS_W}pict")
-
-                                # Create a new run with translated text and copy the formatting
-                                new_run = para.add_run(translated_text)
-                                new_run.style = run.style
-
-                                if drawing is not None:
-                                    new_run._element.append(drawing)
-                                elif pict is not None:
-                                    new_run._element.append(pict)
-
-                                # Copy formatting from original run
-                                new_run.bold = run.bold
-                                new_run.italic = run.italic
-                                new_run.underline = run.underline
-                                new_run.font.size = run.font.size
-                                new_run.font.color.rgb = run.font.color.rgb
-                                new_run.font.name = run.font.name
-                                table_index += 1
-                            elif isinstance(run, docx.text.hyperlink.Hyperlink):
-                                parent = run._element
-                                tag = parent.tag.split("}")[-1]
-
-                                # Create a new hyperlink element with the correct namespace
-                                new_hyperlink = OxmlElement(f"w:{tag}")
-                                for attr in parent.attrib:
-                                    new_hyperlink.set(attr, parent.get(attr))
-                                for child in parent:
-                                    new_hyperlink.append(child)
-                                para._element.append(new_hyperlink)
-
-def translate_header_footer(doc, source_lang, target_lang):
+def translate_header_footer(doc, target_lang):
     head_foot = []
     for section in doc.sections:
         for header in section.header.paragraphs:
@@ -210,7 +108,7 @@ def translate_header_footer(doc, source_lang, target_lang):
         for footer in section.footer.paragraphs:
             for run in footer.runs:
                 head_foot.append(run.text)
-    translated_head_foot =
+    translated_head_foot = full_translate(head_foot, target_lang)

     i = 0
     for section in doc.sections:
@@ -222,25 +120,85 @@
             for run in footer.runs:
                 run.text = translated_head_foot[i]
                 i += 1
+
+def get_text_elements_para(doc):
+    para_texts = []
+    for para in doc.paragraphs:
+        for element in para._element.iter():
+            if element.tag.endswith('t'):
+                if element.text:
+                    emoji_pattern = r'[\U00010000-\U0010FFFF]'
+                    # Split the text but keep emojis as separate elements
+                    parts = re.split(f'({emoji_pattern})', element.text)
+                    for part in parts:
+                        if re.match(emoji_pattern, part):
+                            continue
+                        para_texts.append(part)
+    return para_texts
+
+def get_text_elements_table(doc):
+    table_texts = []
+    for table in doc.tables:
+        for row in table.rows:
+            for cell in row.cells:
+                table_texts += get_text_elements_para(cell)
+    return table_texts

+def translate_paragraphs(doc, translated_texts, i = 0):
+    for para in doc.paragraphs:
+        for element in para._element.iter():
+            if element.tag.endswith('t'):
+                if element.text:
+                    emoji_pattern = r'[\U00010000-\U0010FFFF]'
+                    # Split the text but keep emojis as separate elements
+                    parts = re.split(f'({emoji_pattern})', element.text)
+                    for j in range(len(parts)):
+                        if re.match(emoji_pattern, parts[j]):
+                            continue
+                        translated_text = translated_texts[i]
+                        i += 1
+                        parts[j] = translated_text
+                    element.text = "".join(parts)
+    return doc, i
+
+def translate_tables(doc, translated_texts):
+    i = 0
+    for table in doc.tables:
+        for row in table.rows:
+            for cell in row.cells:
+                cell, i = translate_paragraphs(cell, translated_texts, i)
+    return doc
+
-def
-
-
+def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
+    # Kết nối MongoDB
+    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
+    db = client["word"]
     fs_input = GridFS(db, collection="root_file")
     fs_output = GridFS(db, collection="final_file")

-
+    # Lấy file từ MongoDB
     file_data = fs_input.get(file_id).read()
-
+    original_file = fs_input.get(file_id).filename # Lấy tên gốc của file
+    doc = Document(io.BytesIO(file_data))
+
+    # Lấy nội dung và dịch
+    para_texts = get_text_elements_para(doc)
+    translated_para = full_translate(para_texts, target_lang)
+
+    table_texts = get_text_elements_table(doc)
+    translated_tables = full_translate(table_texts, target_lang)
-
-
-

+    # Cập nhật nội dung dịch vào document
+    doc, _ = translate_paragraphs(doc, translated_para)
+    doc = translate_tables(doc, translated_tables)
+    translate_header_footer(doc, target_lang)

+    # Lưu file dịch vào MongoDB với cùng tên gốc
     output_stream = io.BytesIO()
-
+    doc.save(output_stream)
     output_stream.seek(0)

-    translated_file_id = fs_output.put(output_stream, filename=
-
+    translated_file_id = fs_output.put(output_stream, filename=original_file)
+    client.close()

-    return translated_file_id
+    return translated_file_id
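A minimal usage sketch of the new entry point, assuming the document was previously stored in the "word" database's root_file bucket as this commit does; the ObjectId below is a placeholder, not a real id from the repo.

    from bson import ObjectId
    from word.word_translate import translate_docx_from_mongodb

    # Translate a stored .docx and get back the GridFS id of the translated copy
    # that translate_docx_from_mongodb writes into the final_file bucket.
    final_id = translate_docx_from_mongodb(
        file_id=ObjectId("000000000000000000000000"),  # hypothetical file id
        target_lang="Vietnamese",
    )
    print(f"Translated document stored with id {final_id}")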