mintlee committed on
Commit
e53f591
·
1 Parent(s): 739e7dc

update word

Browse files
Files changed (5) hide show
  1. db/mongodb.py +1 -1
  2. excel/excel_translate.py +1 -2
  3. pages/upload.py +3 -3
  4. test.ipynb +124 -46
  5. word/word_translate.py +138 -180
db/mongodb.py CHANGED
@@ -13,7 +13,7 @@ def connect_mongodb(db_name, collection_name):
13
 
14
 
15
 
16
- def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file", file_tail=".pptx"):
17
  """
18
  Lưu file vào MongoDB bằng GridFS mà không kiểm tra trùng lặp.
19
 
 
13
 
14
 
15
 
16
+ def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file"):
17
  """
18
  Lưu file vào MongoDB bằng GridFS mà không kiểm tra trùng lặp.
19
 
excel/excel_translate.py CHANGED
@@ -94,7 +94,7 @@ def read_csv_with_auto_encoding(csv_path):
94
  return df
95
 
96
 
97
- def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
98
  # Kết nối MongoDB
99
  client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
100
  db = client[db_name]
@@ -143,7 +143,6 @@ def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", ch
143
  # Now call your LLM translator on this dictionary
144
  translated_chunk = translate_text_dict(
145
  text_dict=chunk_dict,
146
- source_lang=source_lang,
147
  target_lang=target_lang,
148
  gemini_api=gemini_api
149
  )
 
94
  return df
95
 
96
 
97
+ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
98
  # Kết nối MongoDB
99
  client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
100
  db = client[db_name]
 
143
  # Now call your LLM translator on this dictionary
144
  translated_chunk = translate_text_dict(
145
  text_dict=chunk_dict,
 
146
  target_lang=target_lang,
147
  gemini_api=gemini_api
148
  )
pages/upload.py CHANGED
@@ -7,7 +7,7 @@ from powerpoint.xml_handling import (
7
  from translate.translator import translate_text_dict
8
  from powerpoint.pptx_object import create_translated_ppt
9
  from excel.excel_translate import translate_xlsx, translate_csv
10
- from word.word_translate import translate_docx
11
 
12
  import dotenv
13
  import os
@@ -18,7 +18,7 @@ dotenv.load_dotenv(".env")
18
  # Cấu hình API key
19
  api_key = os.getenv("GEMINI_API_KEY")
20
  genai.configure(api_key=api_key)
21
- model = genai.GenerativeModel("gemini-1.5-flash")
22
 
23
  # Giao diện Streamlit
24
  st.title("Please chose your PPTX, Excel file to translate")
@@ -116,7 +116,7 @@ if uploaded_file is not None:
116
  file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file")
117
  st.write(f"File ID: {file_id}")
118
 
119
- final_id = translate_docx(file_id=file_id, source_lang="en", target_lang="vi")
120
  st.write(f"Final CSV ID: {final_id}")
121
  if final_id:
122
  st.write("✅ File đã sẵn sàng để tải xuống!")
 
7
  from translate.translator import translate_text_dict
8
  from powerpoint.pptx_object import create_translated_ppt
9
  from excel.excel_translate import translate_xlsx, translate_csv
10
+ from word.word_translate import translate_docx_from_mongodb
11
 
12
  import dotenv
13
  import os
 
18
  # Cấu hình API key
19
  api_key = os.getenv("GEMINI_API_KEY")
20
  genai.configure(api_key=api_key)
21
+ model = genai.GenerativeModel("gemini-2.0-flash")
22
 
23
  # Giao diện Streamlit
24
  st.title("Please chose your PPTX, Excel file to translate")
 
116
  file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file")
117
  st.write(f"File ID: {file_id}")
118
 
119
+ final_id = translate_docx_from_mongodb(file_id = file_id, target_lang="Vietnamese")
120
  st.write(f"Final CSV ID: {final_id}")
121
  if final_id:
122
  st.write("✅ File đã sẵn sàng để tải xuống!")
test.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -23,9 +23,17 @@
23
  },
24
  {
25
  "cell_type": "code",
26
- "execution_count": null,
27
  "metadata": {},
28
- "outputs": [],
 
 
 
 
 
 
 
 
29
  "source": [
30
  "from pymongo import MongoClient\n",
31
  "\n",
@@ -79,7 +87,7 @@
79
  },
80
  {
81
  "cell_type": "code",
82
- "execution_count": 3,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
@@ -92,7 +100,7 @@
92
  " :param collection_name: Tên collection GridFS\n",
93
  " \"\"\"\n",
94
  " # Kết nối đến MongoDB\n",
95
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
96
  " db = client[db_name] # Database của bạn\n",
97
  " fs = gridfs.GridFS(db, collection=collection_name) # Collection để lưu file\n",
98
  "\n",
@@ -124,7 +132,7 @@
124
  },
125
  {
126
  "cell_type": "code",
127
- "execution_count": 6,
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
@@ -137,7 +145,7 @@
137
  " \"\"\"\n",
138
  " try:\n",
139
  " # Kết nối MongoDB\n",
140
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
141
  " db = client[db_name]\n",
142
  "\n",
143
  " # Khởi tạo GridFS với collection được chỉ định\n",
@@ -178,7 +186,7 @@
178
  },
179
  {
180
  "cell_type": "code",
181
- "execution_count": null,
182
  "metadata": {},
183
  "outputs": [],
184
  "source": [
@@ -191,7 +199,7 @@
191
  " :param collection_name: Tên collection GridFS\n",
192
  " \"\"\"\n",
193
  " # Kết nối đến MongoDB\n",
194
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
195
  " db = client[db_name]\n",
196
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
197
  "\n",
@@ -219,40 +227,56 @@
219
  },
220
  {
221
  "cell_type": "code",
222
- "execution_count": 8,
223
  "metadata": {},
224
  "outputs": [],
225
  "source": [
226
- "def download_pptx_from_mongodb(file_id, save_path, save_name, db_name=\"ppt\", collection_name=\"final_xml\"):\n",
227
- " \"\"\"\n",
228
- " Tải file PowerPoint từ MongoDB GridFS và lưu về máy.\n",
229
- " \n",
230
- " :param file_id: ID của file cần tải (dạng chuỗi hoặc ObjectId)\n",
231
- " :param save_path: Đường dẫn đến thư mục sẽ lưu file (VD: 'D:/output')\n",
232
- " :param save_name: Tên file khi lưu (VD: 'my_presentation.pptx')\n",
233
- " :param db_name: Tên database trong MongoDB (mặc định: 'ppt')\n",
234
- " :param collection_name: Tên collection GridFS (mặc định: 'root_file')\n",
235
- " \"\"\"\n",
236
- " # Đảm bảo thư mục lưu file tồn tại\n",
237
- " os.makedirs(save_path, exist_ok=True)\n",
238
  "\n",
239
- " # Tạo đường dẫn đầy đủ cho file\n",
240
- " full_file_path = os.path.join(save_path, save_name)\n",
241
  "\n",
242
- " # Kết nối đến MongoDB\n",
243
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  " db = client[db_name]\n",
245
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
246
  "\n",
247
  " try:\n",
248
- " # Chuyển đổi ID nếu cần\n",
249
  " if not isinstance(file_id, ObjectId):\n",
250
  " file_id = ObjectId(file_id)\n",
251
  "\n",
252
- " # Lấy dữ liệu file từ GridFS\n",
253
  " file_data = fs.get(file_id)\n",
254
  " \n",
255
- " # Ghi dữ liệu ra file\n",
256
  " with open(full_file_path, \"wb\") as f:\n",
257
  " f.write(file_data.read())\n",
258
  "\n",
@@ -265,7 +289,41 @@
265
  },
266
  {
267
  "cell_type": "code",
268
- "execution_count": 6,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  "metadata": {},
270
  "outputs": [],
271
  "source": [
@@ -278,7 +336,7 @@
278
  " :param db_name: Tên database MongoDB\n",
279
  " :param collection_name: Tên collection GridFS\n",
280
  " \"\"\"\n",
281
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
282
  " db = client[db_name]\n",
283
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
284
  "\n",
@@ -342,7 +400,7 @@
342
  " :return: ID của file XML trong MongoDB (original_xml)\n",
343
  " \"\"\"\n",
344
  " # Kết nối MongoDB\n",
345
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
346
  " db = client[db_name]\n",
347
  "\n",
348
  " fs_ppt = gridfs.GridFS(db, collection=\"original_pptx\") # PPT gốc\n",
@@ -391,7 +449,7 @@
391
  },
392
  {
393
  "cell_type": "code",
394
- "execution_count": 9,
395
  "metadata": {},
396
  "outputs": [],
397
  "source": [
@@ -406,7 +464,7 @@
406
  " :return: Dictionary {slide_number: [text1, text2, ...]}\n",
407
  " \"\"\"\n",
408
  " # Kết nối MongoDB\n",
409
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
410
  " db = client[db_name]\n",
411
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
412
  "\n",
@@ -472,7 +530,7 @@
472
  },
473
  {
474
  "cell_type": "code",
475
- "execution_count": 12,
476
  "metadata": {},
477
  "outputs": [],
478
  "source": [
@@ -485,7 +543,7 @@
485
  " :param db_name: Tên database MongoDB\n",
486
  " \"\"\"\n",
487
  " # Kết nối MongoDB\n",
488
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
489
  " db = client[db_name]\n",
490
  " \n",
491
  " fs_original = gridfs.GridFS(db, collection=\"original_xml\") # Lấy file từ original_xml\n",
@@ -644,10 +702,10 @@
644
  "name": "stdout",
645
  "output_type": "stream",
646
  "text": [
647
- "✅ Đã xóa 6 file trong collection 'root_file'\n",
648
  "✅ Đã xóa 1 file trong collection 'final_pptx'\n",
649
- "✅ Đã xóa 0 file trong collection 'original_xml'\n",
650
- "✅ Đã xóa 0 file trong collection 'final_xml'\n"
651
  ]
652
  }
653
  ],
@@ -656,6 +714,25 @@
656
  " delete_all_files_in_collection(i)"
657
  ]
658
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  {
660
  "cell_type": "code",
661
  "execution_count": 19,
@@ -675,13 +752,13 @@
675
  },
676
  {
677
  "cell_type": "code",
678
- "execution_count": null,
679
  "metadata": {},
680
  "outputs": [],
681
  "source": [
682
  "def file_list(collection=\"root_file\"):\n",
683
- " client = MongoClient(\"mongodb://localhost:27017/\")\n",
684
- " db = client[\"csv\"]\n",
685
  " fs = gridfs.GridFS(db, collection=collection)\n",
686
  " for file in fs.find():\n",
687
  " print(f\"📂 File: {file.filename} - ID: {file._id}\")"
@@ -696,16 +773,17 @@
696
  },
697
  {
698
  "cell_type": "code",
699
- "execution_count": 14,
700
  "metadata": {},
701
  "outputs": [
702
  {
703
  "name": "stdout",
704
  "output_type": "stream",
705
  "text": [
706
- "📂 File: test1.xlsx - ID: 67d849b4ef2fcc7f191324f9\n",
707
- "📂 File: test3.csv - ID: 67d864962cda0e8d5dd832d5\n",
708
- "📂 File: test1.csv - ID: 67d8651a71e13e1efa8d56db\n"
 
709
  ]
710
  }
711
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 5,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
23
  },
24
  {
25
  "cell_type": "code",
26
+ "execution_count": 1,
27
  "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "name": "stdout",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "Kết nối thành công!\n"
34
+ ]
35
+ }
36
+ ],
37
  "source": [
38
  "from pymongo import MongoClient\n",
39
  "\n",
 
87
  },
88
  {
89
  "cell_type": "code",
90
+ "execution_count": null,
91
  "metadata": {},
92
  "outputs": [],
93
  "source": [
 
100
  " :param collection_name: Tên collection GridFS\n",
101
  " \"\"\"\n",
102
  " # Kết nối đến MongoDB\n",
103
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
104
  " db = client[db_name] # Database của bạn\n",
105
  " fs = gridfs.GridFS(db, collection=collection_name) # Collection để lưu file\n",
106
  "\n",
 
132
  },
133
  {
134
  "cell_type": "code",
135
+ "execution_count": 10,
136
  "metadata": {},
137
  "outputs": [],
138
  "source": [
 
145
  " \"\"\"\n",
146
  " try:\n",
147
  " # Kết nối MongoDB\n",
148
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
149
  " db = client[db_name]\n",
150
  "\n",
151
  " # Khởi tạo GridFS với collection được chỉ định\n",
 
186
  },
187
  {
188
  "cell_type": "code",
189
+ "execution_count": 3,
190
  "metadata": {},
191
  "outputs": [],
192
  "source": [
 
199
  " :param collection_name: Tên collection GridFS\n",
200
  " \"\"\"\n",
201
  " # Kết nối đến MongoDB\n",
202
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
203
  " db = client[db_name]\n",
204
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
205
  "\n",
 
227
  },
228
  {
229
  "cell_type": "code",
230
+ "execution_count": 25,
231
  "metadata": {},
232
  "outputs": [],
233
  "source": [
234
+ "def download_input_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n",
235
+ " os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\input\", exist_ok=True)\n",
 
 
 
 
 
 
 
 
 
 
236
  "\n",
237
+ " full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\input\", save_name)\n",
 
238
  "\n",
239
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
240
+ " db = client[db_name]\n",
241
+ " fs = gridfs.GridFS(db, collection=collection_name)\n",
242
+ "\n",
243
+ " try:\n",
244
+ " if not isinstance(file_id, ObjectId):\n",
245
+ " file_id = ObjectId(file_id)\n",
246
+ "\n",
247
+ " file_data = fs.get(file_id)\n",
248
+ " \n",
249
+ " with open(full_file_path, \"wb\") as f:\n",
250
+ " f.write(file_data.read())\n",
251
+ "\n",
252
+ " print(f\"✅ File đã được tải về: {full_file_path}\")\n",
253
+ " except Exception as e:\n",
254
+ " print(f\"❌ Lỗi khi tải file: {e}\")\n",
255
+ " finally:\n",
256
+ " client.close()"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 27,
262
+ "metadata": {},
263
+ "outputs": [],
264
+ "source": [
265
+ "def download_output_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n",
266
+ " os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\output\", exist_ok=True)\n",
267
+ "\n",
268
+ " full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\output\", save_name)\n",
269
+ "\n",
270
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
271
  " db = client[db_name]\n",
272
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
273
  "\n",
274
  " try:\n",
 
275
  " if not isinstance(file_id, ObjectId):\n",
276
  " file_id = ObjectId(file_id)\n",
277
  "\n",
 
278
  " file_data = fs.get(file_id)\n",
279
  " \n",
 
280
  " with open(full_file_path, \"wb\") as f:\n",
281
  " f.write(file_data.read())\n",
282
  "\n",
 
289
  },
290
  {
291
  "cell_type": "code",
292
+ "execution_count": 29,
293
+ "metadata": {},
294
+ "outputs": [
295
+ {
296
+ "name": "stdout",
297
+ "output_type": "stream",
298
+ "text": [
299
+ "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\chuong 8 NHTM.pptx\n"
300
+ ]
301
+ }
302
+ ],
303
+ "source": [
304
+ "download_input_from_mongodb(file_id=\"67dd7148972b1aa4dc9fb83d\", save_name=\"chuong 8 NHTM.pptx\", db_name=\"ppt\", collection_name=\"root_file\")"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 28,
310
+ "metadata": {},
311
+ "outputs": [
312
+ {
313
+ "name": "stdout",
314
+ "output_type": "stream",
315
+ "text": [
316
+ "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\chuong 8 NHTM.pptx\n"
317
+ ]
318
+ }
319
+ ],
320
+ "source": [
321
+ "download_output_from_mongodb(file_id=\"67dd717f972b1aa4dc9fb84f\", save_name=\"chuong 8 NHTM.pptx\", db_name=\"ppt\", collection_name=\"final_pptx\")"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": null,
327
  "metadata": {},
328
  "outputs": [],
329
  "source": [
 
336
  " :param db_name: Tên database MongoDB\n",
337
  " :param collection_name: Tên collection GridFS\n",
338
  " \"\"\"\n",
339
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
340
  " db = client[db_name]\n",
341
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
342
  "\n",
 
400
  " :return: ID của file XML trong MongoDB (original_xml)\n",
401
  " \"\"\"\n",
402
  " # Kết nối MongoDB\n",
403
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
404
  " db = client[db_name]\n",
405
  "\n",
406
  " fs_ppt = gridfs.GridFS(db, collection=\"original_pptx\") # PPT gốc\n",
 
449
  },
450
  {
451
  "cell_type": "code",
452
+ "execution_count": null,
453
  "metadata": {},
454
  "outputs": [],
455
  "source": [
 
464
  " :return: Dictionary {slide_number: [text1, text2, ...]}\n",
465
  " \"\"\"\n",
466
  " # Kết nối MongoDB\n",
467
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
468
  " db = client[db_name]\n",
469
  " fs = gridfs.GridFS(db, collection=collection_name)\n",
470
  "\n",
 
530
  },
531
  {
532
  "cell_type": "code",
533
+ "execution_count": null,
534
  "metadata": {},
535
  "outputs": [],
536
  "source": [
 
543
  " :param db_name: Tên database MongoDB\n",
544
  " \"\"\"\n",
545
  " # Kết nối MongoDB\n",
546
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
547
  " db = client[db_name]\n",
548
  " \n",
549
  " fs_original = gridfs.GridFS(db, collection=\"original_xml\") # Lấy file từ original_xml\n",
 
702
  "name": "stdout",
703
  "output_type": "stream",
704
  "text": [
705
+ "✅ Đã xóa 4 file trong collection 'root_file'\n",
706
  "✅ Đã xóa 1 file trong collection 'final_pptx'\n",
707
+ "✅ Đã xóa 1 file trong collection 'original_xml'\n",
708
+ "✅ Đã xóa 1 file trong collection 'final_xml'\n"
709
  ]
710
  }
711
  ],
 
714
  " delete_all_files_in_collection(i)"
715
  ]
716
  },
717
+ {
718
+ "cell_type": "code",
719
+ "execution_count": 17,
720
+ "metadata": {},
721
+ "outputs": [
722
+ {
723
+ "name": "stdout",
724
+ "output_type": "stream",
725
+ "text": [
726
+ "✅ Đã xóa 5 file trong collection 'root_file'\n",
727
+ "✅ Đã xóa 2 file trong collection 'final_file'\n"
728
+ ]
729
+ }
730
+ ],
731
+ "source": [
732
+ "for i in ['root_file', 'final_file']:\n",
733
+ " delete_all_files_in_collection(i, db_name=\"excel\")"
734
+ ]
735
+ },
736
  {
737
  "cell_type": "code",
738
  "execution_count": 19,
 
752
  },
753
  {
754
  "cell_type": "code",
755
+ "execution_count": 7,
756
  "metadata": {},
757
  "outputs": [],
758
  "source": [
759
  "def file_list(collection=\"root_file\"):\n",
760
+ " client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
761
+ " db = client[\"ppt\"]\n",
762
  " fs = gridfs.GridFS(db, collection=collection)\n",
763
  " for file in fs.find():\n",
764
  " print(f\"📂 File: {file.filename} - ID: {file._id}\")"
 
773
  },
774
  {
775
  "cell_type": "code",
776
+ "execution_count": 8,
777
  "metadata": {},
778
  "outputs": [
779
  {
780
  "name": "stdout",
781
  "output_type": "stream",
782
  "text": [
783
+ "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcd8c575cfef63155d3f91\n",
784
+ "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcea4f02257ad0cb04610e\n",
785
+ "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcead0143da29a5c6321ab\n",
786
+ "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dd3bf23cf7ee2f6eca902e\n"
787
  ]
788
  }
789
  ],
word/word_translate.py CHANGED
@@ -1,60 +1,97 @@
1
- import os
2
  import docx
3
  from docx import Document
4
  import google.generativeai as genai
5
  import ast
6
  import json
7
- from docx.oxml import OxmlElement
8
- from copy import deepcopy
 
9
  import io
 
10
  from pymongo import MongoClient
11
  from gridfs import GridFS
12
  from docx import Document
13
- from deep_translator import GoogleTranslator
14
 
15
- gemini_api = "AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg"
16
- target_language = 'vi'
17
- source_language = 'en'
 
18
 
19
- def batch_translate(texts, source_lang = 'en', target_lang="fr"):
20
  """ Translates multiple text segments in a single API call. """
21
  if not texts:
22
  return texts # Skip if empty
23
-
24
- prompt = f"""
25
- Translate the following JSON file from {source_lang} into {target_lang} while preserving names, links, symbols, and formatting:
26
- {json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])}
27
-
28
  - The original JSON file contains a Python array of objects, each with "index" and "text" keys.
29
  - Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
 
 
 
 
 
 
 
 
 
30
  - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
31
- - Return only valid JSON a Python array of translated objects.
32
- - If the original array is empty, return an empty array.
 
 
 
 
 
 
 
 
33
  """
34
-
35
- client = genai.Client(api_key=gemini_api)
36
- response = client.models.generate_content(
37
- model="gemini-2.0-flash", contents=prompt)
38
-
39
- translated_output = ast.literal_eval(response.text.strip().strip("json```").strip("```").strip())
40
-
41
- return [item["text"] for item in translated_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def merge_runs(runs):
44
  """ Merges adjacent runs with the same style. """
45
  merged_runs = []
46
  for run in runs:
47
- if merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run):
48
- if (
49
- merged_runs and
50
  run.style == merged_runs[-1].style and
51
  merged_runs[-1].bold == run.bold and
52
  merged_runs[-1].italic == run.italic and
53
  merged_runs[-1].underline == run.underline and
54
  merged_runs[-1].font.size == run.font.size and
55
  merged_runs[-1].font.color.rgb == run.font.color.rgb and
56
- merged_runs[-1].font.name == run.font.name
57
- ):
58
  merged_runs[-1].text += run.text
59
  else:
60
  merged_runs.append(run)
@@ -62,146 +99,7 @@ def merge_runs(runs):
62
 
63
  NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
64
 
65
- def translate_paragraphs(doc, source_lang, target_lang):
66
- paragraphs = []
67
- for para in doc.paragraphs:
68
- for run in merge_runs(para.iter_inner_content()):
69
- if isinstance(run, docx.text.run.Run):
70
- paragraphs.append(run.text)
71
- # paragraphs = merge_runs(paragraphs)
72
- translated_paragraphs = []
73
- temp_batch = []
74
- words = 0
75
- for para in paragraphs:
76
- if len(para) + words > 5000:
77
- translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
78
- temp_batch = []
79
- words = 0
80
- words += len(para)
81
- temp_batch.append(para)
82
- translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
83
- # translated_paragraphs = batch_translate(paragraphs, target_lang)
84
-
85
- if len(translated_paragraphs) > 0:
86
- # Replace translated text back
87
- para_index = 0
88
- for para in doc.paragraphs:
89
- original_para = deepcopy(para)
90
- para.clear() # Remove text while keeping paragraph properties
91
- for run in merge_runs(original_para.iter_inner_content()):
92
- if isinstance(run, docx.text.run.Run):
93
- translated_text = translated_paragraphs[para_index]
94
- try:
95
- translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8') # Ignore invalid characters
96
- except UnicodeEncodeError:
97
- translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8') # Replace invalid characters
98
- drawing = run._element.find(f".//{NS_W}drawing")
99
- pict = run._element.find(".//{NS_W}pict")
100
-
101
- # Create a new run with translated text and copy the formatting
102
- new_run = para.add_run(translated_text)
103
- new_run.style = run.style
104
-
105
- if drawing is not None:
106
- new_run._element.append(drawing)
107
- elif pict is not None:
108
- new_run._element.append(pict)
109
-
110
- # Copy formatting from original run
111
- new_run.bold = run.bold
112
- new_run.italic = run.italic
113
- new_run.underline = run.underline
114
- new_run.font.size = run.font.size
115
- new_run.font.color.rgb = run.font.color.rgb
116
- new_run.font.name = run.font.name
117
- para_index += 1
118
- elif isinstance(run, docx.text.hyperlink.Hyperlink):
119
- parent = run._element
120
- tag = parent.tag.split("}")[-1]
121
-
122
- # Create a new hyperlink element with the correct namespace
123
- new_hyperlink = OxmlElement(f"w:{tag}")
124
- for attr in parent.attrib:
125
- new_hyperlink.set(attr, parent.get(attr))
126
- for child in parent:
127
- new_hyperlink.append(child)
128
- para._element.append(new_hyperlink)
129
-
130
-
131
- def translate_tables(doc, source_lang, target_lang):
132
- table_texts = []
133
- run_mapping = {}
134
-
135
-
136
- for table in doc.tables:
137
- for row in table.rows:
138
- for cell in row.cells:
139
- for para in cell.paragraphs:
140
- for run in merge_runs(para.iter_inner_content()):
141
- if isinstance(run, docx.text.run.Run):
142
- table_texts.append(run.text)
143
-
144
- translated_tables = []
145
- temp_batch = []
146
- words = 0
147
- for para in table_texts:
148
- if len(para) + words > 5000:
149
- translated_tables += batch_translate(temp_batch, source_lang, target_lang)
150
- temp_batch = []
151
- words = 0
152
- words += len(para)
153
- temp_batch.append(para)
154
- translated_tables += batch_translate(temp_batch, source_lang, target_lang)
155
- # translated_tables = batch_translate(table_texts, target_lang)
156
-
157
- if len(translated_tables) > 0:
158
- table_index = 0
159
- for table in doc.tables:
160
- for row in table.rows:
161
- for cell in row.cells:
162
- for para in cell.paragraphs:
163
- original_para = deepcopy(para)
164
- para.clear() # Remove text while keeping paragraph properties
165
- for run in merge_runs(original_para.iter_inner_content()):
166
- if isinstance(run, docx.text.run.Run):
167
- translated_text = translated_tables[table_index]
168
- try:
169
- translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8') # Ignore invalid characters
170
- except UnicodeEncodeError:
171
- translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8') # Replace invalid characters
172
- drawing = run._element.find(f".//{NS_W}drawing")
173
- pict = run._element.find(".//{NS_W}pict")
174
-
175
- # Create a new run with translated text and copy the formatting
176
- new_run = para.add_run(translated_text)
177
- new_run.style = run.style
178
-
179
- if drawing is not None:
180
- new_run._element.append(drawing)
181
- elif pict is not None:
182
- new_run._element.append(pict)
183
-
184
- # Copy formatting from original run
185
- new_run.bold = run.bold
186
- new_run.italic = run.italic
187
- new_run.underline = run.underline
188
- new_run.font.size = run.font.size
189
- new_run.font.color.rgb = run.font.color.rgb
190
- new_run.font.name = run.font.name
191
- table_index += 1
192
- elif isinstance(run, docx.text.hyperlink.Hyperlink):
193
- parent = run._element
194
- tag = parent.tag.split("}")[-1]
195
-
196
- # Create a new hyperlink element with the correct namespace
197
- new_hyperlink = OxmlElement(f"w:{tag}")
198
- for attr in parent.attrib:
199
- new_hyperlink.set(attr, parent.get(attr))
200
- for child in parent:
201
- new_hyperlink.append(child)
202
- para._element.append(new_hyperlink)
203
-
204
- def translate_header_footer(doc, source_lang, target_lang):
205
  head_foot = []
206
  for section in doc.sections:
207
  for header in section.header.paragraphs:
@@ -210,7 +108,7 @@ def translate_header_footer(doc, source_lang, target_lang):
210
  for footer in section.footer.paragraphs:
211
  for run in footer.runs:
212
  head_foot.append(run.text)
213
- translated_head_foot = batch_translate(head_foot, source_lang, target_lang)
214
 
215
  i = 0
216
  for section in doc.sections:
@@ -222,25 +120,85 @@ def translate_header_footer(doc, source_lang, target_lang):
222
  for run in footer.runs:
223
  run.text = translated_head_foot[i]
224
  i += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- def translate_docx(file_id, source_lang='en', target_lang='fr', db_name='word'):
227
- client = MongoClient('mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0')
228
- db = client[db_name]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  fs_input = GridFS(db, collection="root_file")
230
  fs_output = GridFS(db, collection="final_file")
231
 
 
232
  file_data = fs_input.get(file_id).read()
233
- input_doc = Document(io.BytesIO(file_data))
 
 
 
 
 
 
 
 
234
 
235
- translate_paragraphs(input_doc, source_lang, target_lang)
236
- translate_tables(input_doc, source_lang, target_lang)
237
- translate_header_footer(input_doc, source_lang, target_lang)
 
238
 
 
239
  output_stream = io.BytesIO()
240
- input_doc.save(output_stream)
241
  output_stream.seek(0)
242
 
243
- translated_file_id = fs_output.put(output_stream, filename=f"{target_lang}_translated.docx")
244
- print(f"Translation complete! Saved with file ID: {translated_file_id}")
245
 
246
- return translated_file_id
 
 
1
  import docx
2
  from docx import Document
3
  import google.generativeai as genai
4
  import ast
5
  import json
6
+ import re
7
+ import dotenv
8
+ import os
9
  import io
10
+
11
  from pymongo import MongoClient
12
  from gridfs import GridFS
13
  from docx import Document
 
14
 
15
+ dotenv.load_dotenv(".env")
16
+ api_key = os.getenv("GEMINI_API_KEY")
17
+ genai.configure(api_key=api_key)
18
+ model = genai.GenerativeModel("gemini-2.0-flash")
19
 
20
def batch_translate(texts, target_lang="Vietnamese"):
    """Translate multiple text segments in a single Gemini API call.

    The segments are sent as a JSON array of ``{"index", "text"}`` objects and
    the reply is mapped back onto the input positions by ``index``.

    Args:
        texts: List of strings to translate.
        target_lang: Name of the target language, inserted into the prompt.

    Returns:
        A list with exactly ``len(texts)`` strings. Positions for which the
        model returned no usable translation keep their original text. An
        empty ``texts`` is returned unchanged without calling the API.

    Raises:
        ValueError, SyntaxError: If the reply can be parsed neither as JSON
            nor as a Python literal.
    """
    if not texts:
        return texts  # Skip if empty

    # NOTE(review): the example output in this prompt lists index 4 before
    # index 3 with their texts swapped, which contradicts "the index should
    # stay consistent" -- confirm whether that is intentional.
    system_prompt = """ You are given three inputs: source language, target language and a json file.
    - Your task is to translate the JSON file from the source language (you have to detect the source language yourself) to the target language.
    - The original JSON file contains a Python array of objects, each with "index" and "text" keys.
    - Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
    - The array contains text that makes up whole paragraphs. Make sure that the translation makes sense when the text is put together and retains the same context.
    - This is very important: Empty spaces should be left as is. For example: From English, "Hello " should be translated into Vietnamese as "Xin chào ", with the same space at the end.
    - Very frequently there are spaces before or after a string. Do not remove these spaces.
    - If the source language is English and the target language is Vietnamese and a string contains "'s" in the possessive sense, translate it as "của".
    - Example: [["WorldQuant's"], ["Mission"]] should be translated as [["Nhiệm vụ"], ["của WorldQuant"]]
    - Words might be split into multiple continuous arrays. Translate them such that the translation corresponds to the full word.
    - If a word is split up into multiple arrays, the translation should be such that the word is not split up.
    - Exampe: ['Tesla sells its pro', 'ducts'] should be translated as ['Tesla bán sản phẩm của mình', ''.]. Note that the number of elements in the output is the same as the input.
    - Example: [["Hello"], ["World"]] should be translated as [["Xin chào"], ["Thế giới"]]
    - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
    - Return a JSON object that is a Python array.
    - Each object in the array is a dictionary with two keys: "index" and "text".
    - The text should be the translated version of the text in the original object, and the index should stay consistent.
    - The number of objects in the output MUST the same as the number of objects in the input.
    - The format of the output should look exactly like the example.
    - Example:
    **Input**: Target language: Vietnamese. JSON file:
    [{"index": 0, "text": ["My name is "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 3, "text": ["Today is "]}, {"index": 4, "text": ["a"]}, {"index": 5, "text": ["good day"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]'
    **Output**: [{"index": 0, "text": ["Tên tôi là "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 4, "text": ["Hôm nay là "]}, {"index": 3, "text": ["Một"]}, {"index": 5, "text": ["ngày đẹp"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]
    - Return the result of translation according to the format. Do NOT return code for translating.
    """
    json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"

    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 1,  # Adjust temperature for desired creativity
            'top_p': 1,
            'top_k': 1,
        },
    )
    return _parse_translation_response(response.text, texts)


def _parse_translation_response(raw, texts):
    """Turn the model's raw reply into one translated string per input text."""
    cleaned = raw.strip()

    # Remove a surrounding Markdown code fence (``` or ```json) if present.
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

    # The prompt asks for JSON, so parse it as JSON first; fall back to a
    # Python literal for replies that use single quotes.
    try:
        items = json.loads(cleaned)
    except json.JSONDecodeError:
        items = ast.literal_eval(cleaned)

    by_index = {}
    for item in items:
        if not isinstance(item, dict):
            continue
        try:
            index = int(item["index"])
            text = item["text"]
        except (KeyError, TypeError, ValueError):
            continue
        # The prompt's example wraps each text in a one-element list.
        if isinstance(text, list):
            text = text[0] if text else ""
        if isinstance(text, str):
            by_index[index] = text

    # Always return exactly one entry per input so callers that consume the
    # result positionally stay aligned; missing entries keep the source text.
    return [by_index.get(i, original) for i, original in enumerate(texts)]
65
+
66
def full_translate(texts, target_lang="Vietnamese", max_words=1000):
    """Translate a list of strings in word-count-limited batches.

    Strings are accumulated into a batch until adding the next one would
    bring the batch to ``max_words`` whitespace-separated words or more; the
    batch is then sent to ``batch_translate`` and a new one is started.

    Args:
        texts: List of strings to translate, in document order.
        target_lang: Name of the target language passed to ``batch_translate``.
        max_words: Word budget per batch. A single string that alone reaches
            the budget is still sent, as its own batch.

    Returns:
        The concatenated results of every ``batch_translate`` call, in input
        order. An empty input yields an empty list without any API call.
    """
    full_translated_texts = []
    batch = []
    word_count = 0

    for string in texts:
        n_words = len(string.split())
        # Only flush when there is something to send; an oversized first
        # string must not trigger a request for an empty batch.
        if batch and word_count + n_words >= max_words:
            print('Translating a batch.')
            full_translated_texts += batch_translate(batch, target_lang)
            batch = []
            word_count = 0
        batch.append(string)
        word_count += n_words

    if batch:
        full_translated_texts += batch_translate(batch, target_lang)
    return full_translated_texts
82
 
83
def merge_runs(runs):
    """Merge adjacent runs that share the same style and character formatting.

    A run is folded into the previous entry when both are
    ``docx.text.run.Run`` objects and their style, bold, italic, underline,
    font size, font colour and font name are all equal. Merging appends the
    run's text to the previous run in place, so the earlier run object is
    mutated. Anything else is kept as a separate entry.

    Args:
        runs: Iterable of paragraph inner-content items.

    Returns:
        The list of merged items, in their original order.
    """
    merged_runs = []
    for run in runs:
        previous = merged_runs[-1] if merged_runs else None
        if (previous is not None
                and isinstance(run, docx.text.run.Run)
                and isinstance(previous, docx.text.run.Run)
                and run.style == previous.style
                and previous.bold == run.bold
                and previous.italic == run.italic
                and previous.underline == run.underline
                and previous.font.size == run.font.size
                and previous.font.color.rgb == run.font.color.rgb
                and previous.font.name == run.font.name):
            previous.text += run.text
        else:
            merged_runs.append(run)
    # Hand the merged sequence back to the caller; without this the function
    # would only have side effects and evaluate to None.
    return merged_runs
 
99
 
100
  NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
101
 
102
+ def translate_header_footer(doc, target_lang):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  head_foot = []
104
  for section in doc.sections:
105
  for header in section.header.paragraphs:
 
108
  for footer in section.footer.paragraphs:
109
  for run in footer.runs:
110
  head_foot.append(run.text)
111
+ translated_head_foot = full_translate(head_foot, target_lang)
112
 
113
  i = 0
114
  for section in doc.sections:
 
120
  for run in footer.runs:
121
  run.text = translated_head_foot[i]
122
  i += 1
123
+
124
# Characters outside the Basic Multilingual Plane (U+10000-U+10FFFF). They are
# treated as emoji: kept in place and never sent for translation.
_ASTRAL_CHAR = re.compile(r'[\U00010000-\U0010FFFF]')
# Same character class with a capture group so re.split keeps the separators.
_ASTRAL_SPLIT = re.compile(r'([\U00010000-\U0010FFFF])')


def _iter_text_nodes(container):
    """Yield every non-empty XML text element under the container's paragraphs."""
    for para in container.paragraphs:
        for element in para._element.iter():
            tag = element.tag
            # Comments / processing instructions do not have a string tag.
            if not isinstance(tag, str):
                continue
            # Compare the local name exactly: a suffix test on "t" would also
            # match elements such as instrText (field codes) or delText.
            if tag.rpartition('}')[2] != 't':
                continue
            if element.text:
                yield element


def _is_translatable(part):
    """Return True for a non-empty fragment that is not an emoji separator."""
    return bool(part) and _ASTRAL_CHAR.match(part) is None


def get_text_elements_para(doc):
    """Collect the translatable text fragments of an object's paragraphs.

    Each text element is split around characters above U+FFFF; those
    characters and empty fragments are left out. ``translate_paragraphs``
    walks the same fragments in the same order.

    Args:
        doc: Any object exposing ``paragraphs`` (a document or a table cell).

    Returns:
        List of text fragments in document order.
    """
    para_texts = []
    for element in _iter_text_nodes(doc):
        para_texts.extend(
            part for part in _ASTRAL_SPLIT.split(element.text)
            if _is_translatable(part)
        )
    return para_texts


def get_text_elements_table(doc):
    """Collect the translatable text fragments of every table cell.

    Args:
        doc: Object exposing ``tables``; cells are visited table by table,
            row by row.

    Returns:
        List of text fragments in the order ``translate_tables`` consumes them.
    """
    table_texts = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                table_texts += get_text_elements_para(cell)
    return table_texts


def translate_paragraphs(doc, translated_texts, i=0):
    """Write translated fragments back into an object's paragraphs.

    Fragments are visited exactly as in ``get_text_elements_para`` and
    replaced by ``translated_texts[i]``, ``translated_texts[i + 1]``, ...
    Emoji separators are kept where they were.

    Args:
        doc: Any object exposing ``paragraphs`` (a document or a table cell).
        translated_texts: Translations in extraction order.
        i: Index of the first translation to consume.

    Returns:
        Tuple ``(doc, i)`` where ``i`` is the index after the last fragment
        visited. If ``translated_texts`` runs out, the remaining fragments
        keep their original text instead of raising ``IndexError``.
    """
    for element in _iter_text_nodes(doc):
        parts = _ASTRAL_SPLIT.split(element.text)
        for j, part in enumerate(parts):
            if not _is_translatable(part):
                continue
            if i < len(translated_texts):
                parts[j] = translated_texts[i]
            i += 1
        element.text = "".join(parts)
    return doc, i


def translate_tables(doc, translated_texts):
    """Write translated fragments back into every table cell.

    Args:
        doc: Object exposing ``tables``.
        translated_texts: Translations in the order produced from
            ``get_text_elements_table``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    i = 0
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell, i = translate_paragraphs(cell, translated_texts, i)
    return doc
171
+
172
def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
    """Translate a .docx stored in MongoDB GridFS and store the result.

    The file is read from the ``root_file`` GridFS collection of the ``word``
    database, its body paragraphs, tables and headers/footers are translated,
    and the result is written to the ``final_file`` collection under the
    original filename.

    Args:
        file_id: GridFS id of the source document.
        target_lang: Name of the target language passed to the translator.

    Returns:
        The GridFS id of the translated document.
    """
    # NOTE(review): credentials should not live in source control. The
    # MONGODB_URI environment variable takes precedence when set; the literal
    # below is kept only as a backward-compatible fallback.
    mongo_uri = os.getenv(
        "MONGODB_URI",
        "mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
    )

    # Connect to MongoDB
    client = MongoClient(mongo_uri)
    try:
        db = client["word"]
        fs_input = GridFS(db, collection="root_file")
        fs_output = GridFS(db, collection="final_file")

        # Fetch the file once and reuse it for both its content and its name
        source_file = fs_input.get(file_id)
        original_file = source_file.filename  # original name of the file
        doc = Document(io.BytesIO(source_file.read()))

        # Extract the content and translate it
        para_texts = get_text_elements_para(doc)
        translated_para = full_translate(para_texts, target_lang)

        table_texts = get_text_elements_table(doc)
        translated_tables = full_translate(table_texts, target_lang)

        # Write the translated content back into the document
        doc, _ = translate_paragraphs(doc, translated_para)
        doc = translate_tables(doc, translated_tables)
        translate_header_footer(doc, target_lang)

        # Save the translated file to MongoDB under the same original name
        output_stream = io.BytesIO()
        doc.save(output_stream)
        output_stream.seek(0)

        return fs_output.put(output_stream, filename=original_file)
    finally:
        # Release the connection even when download, translation or upload fails.
        client.close()