Add application file

- .env +1 -0
- README.md +2 -12
- db/mongodb.py +194 -0
- excel/excel_translate.py +174 -0
- home.py +22 -0
- pages/upload.py +134 -0
- powerpoint/__init__.py +0 -0
- powerpoint/pptx_object.py +357 -0
- powerpoint/pptx_processor.py +50 -0
- powerpoint/xml_handling.py +368 -0
- test.ipynb +0 -0
- translate/translator.py +64 -0
- word/word_translate.py +246 -0
.env
ADDED
@@ -0,0 +1 @@
GEMINI_API_KEY = AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg
README.md
CHANGED
@@ -1,13 +1,3 @@
----
-title: MT Deploy
-emoji: 🐠
-colorFrom: green
-colorTo: green
-sdk: streamlit
-sdk_version: 1.43.2
-app_file: app.py
-pinned: false
-short_description: deploy Machine Translation
----
-
-
+# Machine-Translation
+
+- Link drive: https://drive.google.com/drive/folders/19htOXYBz88eNIWU0-_3xEn1JRU-JaIvW?usp=drive_link
db/mongodb.py
ADDED
@@ -0,0 +1,194 @@
from pymongo import MongoClient
import gridfs
from bson import ObjectId
import os
from io import BytesIO
import magic

def connect_mongodb(db_name, collection_name):
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)
    return fs


def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file", file_name=None, file_tail=".pptx"):
    """
    Save a PowerPoint (pptx) file to MongoDB via GridFS,
    skipping the save if the file name already exists.

    :param uploaded_file: UploadedFile object from Streamlit
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    :param file_name: Name to save the file under (no .pptx needed). If None, the original name is used.
    :return: file_id on success, None if the file already exists
    """
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    # Determine the file name
    if not file_name:
        # Take the name from uploaded_file (e.g. "slide.pptx")
        file_name = uploaded_file.name
    else:
        # If the user passed only a base name, append the extension if missing
        if not file_name.endswith(file_tail):
            file_name = file_name + file_tail

    # Check whether the file already exists in MongoDB
    existing_file = fs.find_one({"filename": file_name})
    if existing_file:
        print(f"⚠️ File '{file_name}' already exists in MongoDB. Not saving again. Please choose another name.")
        client.close()
        return None

    # Make sure the file cursor is at the beginning
    uploaded_file.seek(0)
    file_bytes = uploaded_file.read()

    # Store the file contents (bytes) in MongoDB
    file_id = fs.put(file_bytes, filename=file_name)
    print(f"✅ File '{file_name}' saved to '{collection_name}' with ID: {file_id}")
    client.close()
    return file_id

def delete_pptx_from_mongodb(file_id, db_name="ppt", collection_name="root_file"):
    """
    Delete a PowerPoint file from MongoDB by ID.

    :param file_id: ID of the file to delete (string or ObjectId)
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Convert the ID if needed
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)

        # Check whether the file exists
        if fs.exists(file_id):
            fs.delete(file_id)
            print(f"✅ Deleted file with ID: {file_id}")
        else:
            print(f"⚠️ No file found with ID: {file_id}")
    except Exception as e:
        print(f"❌ Error deleting file: {e}")

    client.close()

def download_pptx_from_mongodb(file_id, save_path, save_name, db_name="ppt", collection_name="root_file"):
    """
    Download a PowerPoint file from MongoDB GridFS and save it locally.

    :param file_id: ID of the file to download (string or ObjectId)
    :param save_path: Directory to save the file in (e.g. 'D:/output')
    :param save_name: File name to save as (e.g. 'my_presentation.pptx')
    :param db_name: MongoDB database name (default: 'ppt')
    :param collection_name: GridFS collection name (default: 'root_file')
    """
    # Make sure the target directory exists
    os.makedirs(save_path, exist_ok=True)

    # Build the full file path
    full_file_path = os.path.join(save_path, save_name)

    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Convert the ID if needed
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)

        # Fetch the file data from GridFS
        file_data = fs.get(file_id)

        # Write the data out to disk
        with open(full_file_path, "wb") as f:
            f.write(file_data.read())

        print(f"✅ File downloaded to: {full_file_path}")
    except Exception as e:
        print(f"❌ Error downloading file: {e}")
    finally:
        client.close()

def save_xml_to_gridfs(xml_content, file_name, db_name="ppt", collection_name="original_xml"):
    """
    Save XML to MongoDB GridFS.

    :param xml_content: XML string to save
    :param file_name: XML file name
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    """
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    # Check whether the file already exists
    existing_file = fs.find_one({"filename": file_name})
    if existing_file:
        print(f"⚠️ File '{file_name}' already exists in GridFS. Not saving again.")
        return

    # Encode the XML string to bytes and store it in GridFS
    file_id = fs.put(xml_content.encode("utf-8"), filename=file_name)
    print(f"✅ XML '{file_name}' saved to GridFS with ID: {file_id}")

def fetch_file_from_mongodb(db_name, collection_name, file_id):
    client = MongoClient("mongodb://localhost:27017/")  # Update if needed
    db = client[db_name]
    fs = gridfs.GridFS(db, collection_name)

    try:
        file_data = fs.get(file_id)
        pptx_io = BytesIO(file_data.read())
        pptx_io.seek(0)  # Rewind to the start of the file
        return pptx_io, file_data.filename
    except Exception as e:
        print(f"Error fetching file from MongoDB: {e}")
        return None, None

def detect_file_type(uploaded_file):
    if uploaded_file is not None:
        try:
            file_bytes = uploaded_file.read(4096)  # Read more bytes for reliable MIME detection
            mime = magic.Magic(mime=True)
            file_type = mime.from_buffer(file_bytes)
        except Exception as e:
            print(f"Error detecting file type: {e}")
            file_type = "Unknown"

        # Common MIME types
        mime_types = {
            "application/pdf": "PDF",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
            "application/vnd.ms-powerpoint": "PPTX",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel",
            "application/vnd.ms-excel": "Excel",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word",
            "application/msword": "Word",
            "text/csv": "CSV",
            "text/plain": "CSV"  # Some CSV files are detected as text/plain
        }

        detected_type = mime_types.get(file_type, "Unknown")

        # If still unsure, fall back to the file extension
        if detected_type == "Unknown":
            ext = os.path.splitext(uploaded_file.name)[1].lower()
            ext_mapping = {".csv": "CSV", ".docx": "Word", ".doc": "Word", ".xlsx": "Excel", ".pptx": "PPTX", ".pdf": "PDF"}
            detected_type = ext_mapping.get(ext, "Unknown")

        return detected_type
    return None
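A quick round-trip sketch of these helpers, assuming a local mongod on the default port; FakeUpload is a hypothetical stand-in for Streamlit's UploadedFile (a bytes buffer plus the .name attribute save_file_to_mongodb reads):

from io import BytesIO
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, delete_pptx_from_mongodb

class FakeUpload(BytesIO):
    """Stand-in for st.file_uploader's UploadedFile: read()/seek() plus .name."""

upload = FakeUpload(b"...pptx bytes...")
upload.name = "demo.pptx"

file_id = save_file_to_mongodb(upload, db_name="ppt", collection_name="root_file")
if file_id:
    data_io, filename = fetch_file_from_mongodb("ppt", "root_file", file_id)
    print(filename, len(data_io.getvalue()))
    delete_pptx_from_mongodb(file_id)  # clean up the demo entry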
excel/excel_translate.py
ADDED
@@ -0,0 +1,174 @@
import xlwings as xw
from typing import Any, Dict, List
from translate.translator import translate_text_dict
import math
import chardet
import io
import pandas as pd
import pymongo
import gridfs
from io import BytesIO
import tempfile
import os

def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en', target_lang: str = "fr", gemini_api: str = "", db_name: str = "excel"):
    # Connect to MongoDB
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs_input = gridfs.GridFS(db, collection="root_file")
    fs_output = gridfs.GridFS(db, collection="final_file")

    # Download the file from MongoDB
    file_data = fs_input.get(file_id).read()

    # Write it to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        temp_file.write(file_data)
        temp_file_path = temp_file.name

    # Start xlwings (run Excel hidden)
    app = xw.App(visible=False)
    wb = app.books.open(temp_file_path)  # open the workbook inside this hidden Excel instance

    # Use the requested sheet, or all sheets
    sheets = [wb.sheets[sheet_name]] if sheet_name else wb.sheets

    for sheet in sheets:
        last_row = sheet.used_range.rows.count
        last_col = sheet.used_range.columns.count

        # Collect the text to translate and map each key back to its cell
        text_dict: Dict[str, List[str]] = {}
        cell_map: Dict[str, Any] = {}  # key -> cell object

        for row in range(1, last_row + 1):
            for col in range(1, last_col + 1):
                cell = sheet.range((row, col))  # 1-based addressing
                if isinstance(cell.value, str):
                    key = f"R{row}C{col}"  # key of the form R{row}C{col}
                    text_dict[key] = [cell.value]  # store the value as a one-element list
                    cell_map[key] = cell

        # Translate everything in one bulk call
        translated_dict = translate_text_dict(text_dict, source_lang=from_lang, target_lang=target_lang, gemini_api=gemini_api)

        # Write the translated text back into the cells
        for key, cell in cell_map.items():
            if key in translated_dict:
                translated_text_list = translated_dict[key]
                if translated_text_list and len(translated_text_list) > 0:
                    cell.value = translated_text_list[0]

    # Save the workbook back to the temporary file
    wb.save(temp_file_path)
    wb.close()
    app.quit()

    # Read the temporary file back and store it in MongoDB
    with open(temp_file_path, "rb") as f:
        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.xlsx")

    # Remove the temporary file
    os.remove(temp_file_path)

    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
    return translated_file_id


def read_csv_with_auto_encoding(csv_path):
    # Read the file as raw bytes
    with open(csv_path, "rb") as f:
        raw_data = f.read()
    # Detect the encoding
    detect_result = chardet.detect(raw_data)
    encoding = detect_result["encoding"]
    confidence = detect_result["confidence"]

    print(f"Chardet guesses file '{csv_path}' has encoding = {encoding} (confidence = {confidence})")

    # If chardet cannot detect anything, fall back to 'utf-8'
    if encoding is None:
        encoding = "utf-8"

    decoded_data = raw_data.decode(encoding, errors='replace')

    # Use io.StringIO to turn the string into a file-like object
    csv_data = io.StringIO(decoded_data)
    df = pd.read_csv(csv_data)
    return df


def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
    # Connect to MongoDB
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs_input = gridfs.GridFS(db, collection="root_file")
    fs_output = gridfs.GridFS(db, collection="final_file")

    # Download the file from MongoDB
    file_data = fs_input.get(file_id).read()

    # Write it to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
        temp_file.write(file_data)
        temp_file_path = temp_file.name

    df = read_csv_with_auto_encoding(temp_file_path)

    # If text_columns is not specified, we assume we want to translate everything that looks like text.
    # Otherwise, only translate the given columns.
    if text_columns is None:
        # Example heuristic: choose all object/string columns
        text_columns = df.select_dtypes(include=["object"]).columns.tolist()

    num_rows = len(df)
    num_chunks = math.ceil(num_rows / chunk_size)

    translated_df = df.copy()  # copy to store the final translations

    for chunk_index in range(num_chunks):
        start_idx = chunk_index * chunk_size
        end_idx = min((chunk_index + 1) * chunk_size, num_rows)
        chunk_df = df.iloc[start_idx:end_idx]

        # Build a dictionary structure. For example, row-based:
        # {
        #   "0": {"colA": "some text", "colB": "some text"},
        #   "1": {"colA": "some text", "colB": "some text"},
        #   ...
        # }
        chunk_dict = {}
        for i, row in chunk_df.iterrows():
            row_dict = {}
            for col in text_columns:
                row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
            chunk_dict[str(i)] = row_dict

        # Now call your LLM translator on this dictionary
        translated_chunk = translate_text_dict(
            text_dict=chunk_dict,
            source_lang=source_lang,
            target_lang=target_lang,
            gemini_api=gemini_api
        )

        # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
        for i_str, row_data in translated_chunk.items():
            i = int(i_str)
            for col, translated_val in row_data.items():
                translated_df.at[i, col] = translated_val

    # Save the translated data to a temporary file
    translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
    translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')

    # Read the temporary file back and store it in MongoDB
    with open(translated_file_path, "rb") as f:
        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")

    # Remove the temporary files
    os.remove(temp_file_path)
    os.remove(translated_file_path)

    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
    return translated_file_id
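A minimal driver for translate_csv, assuming a local MongoDB and a working translate.translator.translate_text_dict; the seeded CSV contents and the API key are placeholders:

import pymongo, gridfs
from excel.excel_translate import translate_csv

client = pymongo.MongoClient("mongodb://localhost:27017")
fs = gridfs.GridFS(client["csv"], collection="root_file")
file_id = fs.put(b"name,comment\nAlice,Hello world\n", filename="demo.csv")  # seed a tiny CSV

out_id = translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="YOUR_GEMINI_KEY")  # hypothetical key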
home.py
ADDED
@@ -0,0 +1,22 @@
import streamlit as st
import pandas as pd
import numpy as np

st.title("Some Streamlit Demo, maybe")

st.sidebar.header("Input")
num_rows = st.sidebar.slider("Number of rows", min_value=10, max_value=100, value=20)
num_cols = st.sidebar.slider("Number of columns", min_value=2, max_value=10, value=3)

data = np.random.randn(num_rows, num_cols)
columns = [f"Column {i+1}" for i in range(num_cols)]
df = pd.DataFrame(data, columns=columns)

st.subheader("Generated Data Table")
st.dataframe(df)

st.subheader("Line Chart of the Data")
st.line_chart(df)

st.subheader("Statistics")
st.write(df.describe())
pages/upload.py
ADDED
@@ -0,0 +1,134 @@
import streamlit as st
import google.generativeai as genai
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
from powerpoint.xml_handling import (
    extract_text_from_xml, update_xml_with_translated_text_mongodb, ppt_to_xml_mongodb
)
from translate.translator import translate_text_dict
from powerpoint.pptx_object import create_translated_ppt
from excel.excel_translate import translate_xlsx, translate_csv
from word.word_translate import translate_docx

import dotenv
import os

dotenv.load_dotenv(".env")

# Configure the API key
api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

# Streamlit UI
st.title("Upload PPTX to MongoDB")

uploaded_file = st.file_uploader("Choose a PPTX file", type=["pptx", "xlsx", "csv", "docx"])  # one extension per entry, not a single comma-joined string
file_name_input = st.text_input("File name to save as (no .pptx needed)", value="")

final_pptx_id = None  # ID of the file after processing

if uploaded_file is not None:
    if st.button("Upload"):
        file_type = detect_file_type(uploaded_file)
        st.write(f"Detected file type: {file_type}")
        if file_type == "PPTX":

            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, file_name=file_name_input)
            st.write(f"File ID: {file_id}")

            xml_file_id = ppt_to_xml_mongodb(file_id)
            text_dict = extract_text_from_xml(file_id=xml_file_id)
            translated_dict = translate_text_dict(text_dict, source_lang="Vietnamese", target_lang="English", gemini_api=api_key)

            final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
            st.write(f"Final XML ID: {final_xml_id}")

            # Keep the ID of the final PPTX file
            final_pptx_id = create_translated_ppt(
                db_name="ppt", original_ppt_id=file_id,
                translated_xml_id=final_xml_id, output_collection="final_pptx"
            )
            st.write(f"Final PPTX ID: {final_pptx_id}")

            # Show the result before downloading
            if final_pptx_id:
                st.write("✅ The file is ready to download!")

                pptx_io, pptx_filename = fetch_file_from_mongodb("ppt", "final_pptx", final_pptx_id)

                if pptx_io:
                    # Download button once the file is available
                    st.download_button(
                        label="Click to Download",
                        data=pptx_io.getvalue(),  # convert to bytes for the download
                        file_name=pptx_filename,
                        mime="application/vnd.openxmlformats-officedocument.presentationml.presentation"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")


        elif file_type == "Excel":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="excel", collection_name="root_file", file_name=file_name_input, file_tail=".xlsx")
            st.write(f"File ID: {file_id}")

            final_id = translate_xlsx(file_id=file_id, from_lang="en", target_lang="vi", gemini_api=api_key)
            st.write(f"Final Excel ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                excel_io, excel_filename = fetch_file_from_mongodb("excel", "final_file", final_id)

                if excel_io:
                    st.download_button(
                        label="Click to Download",
                        data=excel_io.getvalue(),
                        file_name=excel_filename,
                        mime="application/vnd.ms-excel"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")
        elif file_type == "CSV":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="csv", collection_name="root_file", file_name=file_name_input, file_tail=".csv")
            st.write(f"File ID: {file_id}")

            final_id = translate_csv(file_id=file_id, source_lang="en", target_lang="vi", gemini_api=api_key)
            st.write(f"Final CSV ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                csv_io, csv_filename = fetch_file_from_mongodb("csv", "final_file", final_id)

                if csv_io:
                    st.download_button(
                        label="Click to Download",
                        data=csv_io.getvalue(),
                        file_name=csv_filename,
                        mime="text/csv"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")

        elif file_type == "Word":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file", file_name=file_name_input, file_tail=".docx")
            st.write(f"File ID: {file_id}")

            final_id = translate_docx(file_id=file_id, source_lang="en", target_lang="vi")
            st.write(f"Final Word ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                docx_io, docx_filename = fetch_file_from_mongodb("word", "final_file", final_id)

                if docx_io:
                    st.download_button(
                        label="Click to Download",
                        data=docx_io.getvalue(),
                        file_name=docx_filename,
                        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")
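Given this layout, the app is presumably started from the repository root with Streamlit's multipage convention, i.e. "streamlit run home.py", with pages/upload.py picked up automatically as a page.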
powerpoint/__init__.py
ADDED
File without changes
powerpoint/pptx_object.py
ADDED
@@ -0,0 +1,357 @@
# ppt_objects.py
from pptx import Presentation
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
from pptx.enum.shapes import MSO_SHAPE_TYPE
import xml.etree.ElementTree as ET
from pptx.util import Pt
from pptx.dml.color import RGBColor
import re
import json

from pymongo import MongoClient
from gridfs import GridFS
from io import BytesIO


def apply_group_properties_recursive(shape, shape_index, parent_element):
    """Recursively applies properties to shapes within groups."""
    if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        group_element = parent_element.find(f".//group_element[@shape_index='{shape_index}']")
        if group_element is not None:
            for i, sub_shape in enumerate(shape.shapes):
                apply_group_properties_recursive(sub_shape, i, group_element)

                # Apply properties for sub-shapes WITHIN the group, based on their type.
                if sub_shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = group_element.find(f".//table_element[@shape_index='{i}']")
                    if table_element is not None:  # explicit None check: a childless Element is falsy
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                table_data = json.loads(props_element.text)
                                apply_table_properties(sub_shape.table, table_data)
                            except (json.JSONDecodeError, KeyError) as e:
                                print(f"Error applying table properties (in group): {str(e)}")

                elif hasattr(sub_shape, "text_frame") and sub_shape.text_frame:
                    text_element = group_element.find(f".//text_element[@shape_index='{i}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(sub_shape, shape_data)
                            except (json.JSONDecodeError, KeyError) as e:
                                print(f"Error applying shape properties (in group): {str(e)}")

def get_alignment_value(alignment_str):
    """Convert alignment string (with extra characters) to PP_ALIGN enum value."""
    alignment_map = {
        'center': PP_ALIGN.CENTER,
        'left': PP_ALIGN.LEFT,
        'right': PP_ALIGN.RIGHT,
        'justify': PP_ALIGN.JUSTIFY
    }
    match = re.match(r"([A-Za-z]+)", alignment_str)
    return alignment_map.get(match.group(1).lower()) if match else None

def get_vertical_anchor(value):
    """Converts vertical_anchor string to MSO_ANCHOR enum."""
    mapping = {
        "TOP": MSO_ANCHOR.TOP,
        "MIDDLE": MSO_ANCHOR.MIDDLE,
        "BOTTOM": MSO_ANCHOR.BOTTOM
    }
    return mapping.get(value.upper().split()[0], MSO_ANCHOR.TOP)

def get_table_properties(table):
    """Extract complete table properties."""
    table_data = {
        'rows': len(table.rows),
        'cols': len(table.columns),
        'cells': []
    }
    for row in table.rows:
        row_data = []
        for cell in row.cells:
            cell_data = {
                'text': cell.text.strip(),
                'font_size': None,
                'font_name': None,
                'alignment': None,
                'margin_left': cell.margin_left,
                'margin_right': cell.margin_right,
                'margin_top': cell.margin_top,
                'margin_bottom': cell.margin_bottom,
                'vertical_anchor': str(cell.vertical_anchor) if cell.vertical_anchor else None,
                'font_color': None
            }
            if cell.text_frame.paragraphs:
                paragraph = cell.text_frame.paragraphs[0]
                if paragraph.runs:
                    run = paragraph.runs[0]
                    if hasattr(run.font, 'size') and run.font.size is not None:
                        cell_data['font_size'] = run.font.size.pt
                    if hasattr(run.font, 'name'):
                        cell_data['font_name'] = run.font.name
                    if hasattr(run.font, 'bold'):
                        cell_data['bold'] = run.font.bold
                    if hasattr(run.font, 'italic'):
                        cell_data['italic'] = run.font.italic
                    if (hasattr(run.font, 'color') and
                            run.font.color is not None and
                            hasattr(run.font.color, 'rgb') and
                            run.font.color.rgb is not None):
                        cell_data['font_color'] = str(run.font.color.rgb)
                if hasattr(paragraph, 'alignment'):
                    cell_data['alignment'] = f"{paragraph.alignment}" if paragraph.alignment else None
            row_data.append(cell_data)
        table_data['cells'].append(row_data)
    return table_data

def get_shape_properties(shape):
    """Extract all properties from a shape, with detailed debug prints."""
    shape_data = {
        'text': '',
        'font_size': None,
        'font_name': None,
        'alignment': None,
        'width': shape.width,
        'height': shape.height,
        'left': shape.left,
        'top': shape.top,
        'bold': None,
        'italic': None,
        'line_spacing_info': {
            'rule': None,
            'value': None
        },
        'space_before': None,
        'space_after': None,
        'font_color': None
    }

    if hasattr(shape, "text"):
        shape_data['text'] = shape.text.strip()
    if hasattr(shape, 'text_frame'):
        for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs):
            if paragraph.runs:
                run = paragraph.runs[0]  # Assuming properties are mostly consistent in the first run
                if hasattr(run.font, 'size') and run.font.size is not None:
                    shape_data['font_size'] = run.font.size.pt
                if hasattr(run.font, 'name'):
                    shape_data['font_name'] = run.font.name
                if hasattr(run.font, 'bold'):
                    shape_data['bold'] = run.font.bold
                if hasattr(run.font, 'italic'):
                    shape_data['italic'] = run.font.italic
                if (hasattr(run.font, 'color') and
                        run.font.color is not None and
                        hasattr(run.font.color, 'rgb') and
                        run.font.color.rgb is not None):
                    shape_data['font_color'] = str(run.font.color.rgb)

            if hasattr(paragraph, 'alignment') and paragraph.alignment is not None:
                shape_data['alignment'] = str(paragraph.alignment).split('.')[-1]
            if hasattr(paragraph, 'space_before'):
                shape_data['space_before'] = paragraph.space_before.pt if paragraph.space_before else None
            if hasattr(paragraph, 'space_after'):
                shape_data['space_after'] = paragraph.space_after.pt if paragraph.space_after else None

            if hasattr(paragraph, 'line_spacing') and paragraph.line_spacing:
                line_spacing = paragraph.line_spacing

                # If line_spacing is a large number (e.g. 84.99 pt), it is probably EXACTLY
                if isinstance(line_spacing, Pt) or line_spacing > 10:
                    line_spacing_rule = "EXACTLY"
                elif isinstance(line_spacing, float):
                    line_spacing_rule = "MULTIPLE"
                else:
                    line_spacing_rule = "UNKNOWN"

                shape_data['line_spacing_info'] = {
                    'rule': line_spacing_rule,
                    'value': line_spacing if isinstance(line_spacing, float) else None
                }

    return shape_data

def apply_shape_properties(shape, shape_data):
    """Apply saved properties to a shape."""
    try:
        shape.width = shape_data['width']
        shape.height = shape_data['height']
        shape.left = shape_data['left']
        shape.top = shape_data['top']
        shape.text = ""
        paragraph = shape.text_frame.paragraphs[0]
        run = paragraph.add_run()
        run.text = shape_data['text']
        if shape_data['font_size']:
            adjusted_size = shape_data['font_size'] * 0.9
            run.font.size = Pt(adjusted_size)

        if shape_data.get('font_name'):
            run.font.name = shape_data['font_name']
        else:
            run.font.name = "Arial"
        if shape_data.get('font_color'):
            run.font.color.rgb = RGBColor.from_string(shape_data['font_color'])
        if shape_data['bold'] is not None:
            run.font.bold = shape_data['bold']
        if shape_data['italic'] is not None:
            run.font.italic = shape_data['italic']
        if shape_data['alignment']:
            paragraph.alignment = get_alignment_value(shape_data['alignment'])

        line_spacing_info = shape_data.get('line_spacing_info', {})
        line_spacing_rule = line_spacing_info.get('rule')
        line_spacing_value = line_spacing_info.get('value')

        if line_spacing_rule and line_spacing_value is not None:
            if line_spacing_rule == "EXACTLY":
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "AT_LEAST":
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "MULTIPLE":
                paragraph.line_spacing = line_spacing_value
            else:
                print(f"⚠️ Unknown line spacing rule: {line_spacing_rule}")

        if shape_data['space_before']:
            paragraph.space_before = Pt(shape_data['space_before'])  # stored in points, so wrap in Pt()
        if shape_data['space_after']:
            paragraph.space_after = Pt(shape_data['space_after'])

    except Exception as e:
        print(f"Error applying shape properties: {str(e)}")


def apply_table_properties(table, table_data):
    """Apply saved properties to a PowerPoint table."""
    for row_idx, row in enumerate(table.rows):
        for col_idx, cell in enumerate(row.cells):
            try:
                cell_data = table_data['cells'][row_idx][col_idx]

                # Apply margins
                cell.margin_left = cell_data.get('margin_left', 0)
                cell.margin_right = cell_data.get('margin_right', 0)
                cell.margin_top = cell_data.get('margin_top', 0)
                cell.margin_bottom = cell_data.get('margin_bottom', 0)

                # Apply vertical_anchor (avoiding eval); skip when it was saved as None
                if cell_data.get('vertical_anchor'):
                    cell.vertical_anchor = get_vertical_anchor(cell_data['vertical_anchor'])

                # Clear the old content and set the new text
                cell.text = ""
                paragraph = cell.text_frame.paragraphs[0]
                run = paragraph.add_run()
                run.text = cell_data.get('text', "")

                # Set the font size (guard against a stored None)
                if cell_data.get('font_size'):
                    adjusted_size = cell_data['font_size'] * 0.9  # keep the font proportions
                    run.font.size = Pt(adjusted_size)

                # Set the font family
                run.font.name = cell_data.get('font_name', 'Arial')

                # Font color
                if cell_data.get('font_color'):
                    run.font.color.rgb = RGBColor.from_string(cell_data['font_color'])

                # Bold & italic
                run.font.bold = cell_data.get('bold', False)
                run.font.italic = cell_data.get('italic', False)

                # Text alignment
                if cell_data.get('alignment'):
                    paragraph.alignment = get_alignment_value(cell_data['alignment'])

            except Exception as e:
                print(f"Error setting properties for cell [{row_idx}, {col_idx}]: {str(e)}")


def get_file_from_mongodb(db_name, collection_name, file_id):
    """Download a file from MongoDB GridFS."""
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = GridFS(db, collection_name)
    file_data = fs.get(file_id)
    return BytesIO(file_data.read())


def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
    """Save a file to MongoDB GridFS."""
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = GridFS(db, collection_name)
    file_id = fs.put(file_data, filename=file_name)
    return file_id

def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
    """Build the translated PowerPoint from MongoDB data and save it back to MongoDB."""
    try:
        # Connect to MongoDB and load the files
        original_ppt_io = get_file_from_mongodb(db_name, "root_file", original_ppt_id)
        translated_xml_io = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)

        # Load the original PowerPoint and the translated XML
        prs = Presentation(original_ppt_io)
        tree = ET.parse(translated_xml_io)
        root = tree.getroot()

        # Apply the translations
        for slide_number, slide in enumerate(prs.slides, 1):
            xml_slide = root.find(f".//slide[@number='{slide_number}']")
            if xml_slide is None:
                continue
            for shape_index, shape in enumerate(slide.shapes):
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    apply_group_properties_recursive(shape, shape_index, xml_slide)
                elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = xml_slide.find(f".//table_element[@shape_index='{shape_index}']")
                    if table_element is not None:
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                table_data = json.loads(props_element.text)
                                apply_table_properties(shape.table, table_data)
                            except Exception as e:
                                print(f"Error applying table properties: {str(e)}")
                elif hasattr(shape, "text"):
                    text_element = xml_slide.find(f".//text_element[@shape_index='{shape_index}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(shape, shape_data)
                            except Exception as e:
                                print(f"Error applying shape properties: {str(e)}")

        # Save the PowerPoint to MongoDB
        output_io = BytesIO()
        prs.save(output_io)
        output_io.seek(0)  # rewind before uploading

        file_id = save_file_to_mongodb(db_name, output_collection, "translated_presentation.pptx", output_io)
        print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")

        return file_id
    except Exception as e:
        print(f"Error creating translated PowerPoint: {str(e)}")
        return None
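A small self-contained check of the extract/apply pair above, assuming python-pptx is installed; it builds a one-slide deck in memory, reads a text box's properties, swaps in a "translated" string, and re-applies the rest:

from pptx import Presentation
from pptx.util import Inches
from powerpoint.pptx_object import get_shape_properties, apply_shape_properties

prs = Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[6])  # blank layout
box = slide.shapes.add_textbox(Inches(1), Inches(1), Inches(4), Inches(1))
box.text_frame.text = "Xin chào"

props = get_shape_properties(box)
props["text"] = "Hello"              # stand-in for the real translation
apply_shape_properties(box, props)   # position, size and formatting survive the swap
prs.save("roundtrip_check.pptx")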
powerpoint/pptx_processor.py
ADDED
@@ -0,0 +1,50 @@
# ppt_processor.py
from pathlib import Path
from xml_handling import ppt_to_xml, translate_xml_file
from pptx_object import create_translated_ppt
import os

def process_ppt_file(ppt_path: str, source_lang: str, target_lang: str):
    """Process a single PPT/PPTX file from XML extraction to final translation."""
    ppt_path = ppt_path.strip("'\"")
    ppt_path = ppt_path.replace("\\ ", " ")
    ppt_path = ppt_path.replace("\\'", "'")
    ppt_path = os.path.expanduser(ppt_path)
    ppt_path = Path(ppt_path).resolve()
    # TODO: convert to a DB link on the server
    try:
        if not ppt_path.is_file():
            print(f"Error: '{ppt_path}' is not a valid file.")
            return
        if ppt_path.suffix.lower() not in ['.ppt', '.pptx']:
            print(f"Error: '{ppt_path}' is not a PowerPoint file.")
            return

        base_dir = ppt_path.parent

        # Original XML
        print(f"Generating original XML for {ppt_path.name}...")
        original_xml = ppt_to_xml(str(ppt_path))
        if original_xml:
            original_output_path = base_dir / f"{ppt_path.stem}_original.xml"
            with open(original_output_path, 'w', encoding='utf-8') as f:
                f.write(original_xml)
            print(f"Original XML saved: {original_output_path}")

            # Save original XML to MongoDB
            # save_xml_to_mongodb(original_xml, ppt_path.stem + "_original.xml")

        # Translated XML
        print(f"Generating translated XML (from {source_lang} to {target_lang}) for {ppt_path.name}...")
        translated_output_path = base_dir / f"{ppt_path.stem}_translated.xml"
        original_xml_path = base_dir / f"{ppt_path.stem}_original.xml"
        translate_xml_file(str(original_xml_path), str(translated_output_path), source_lang, target_lang)

        # Create Translated PPT
        print(f"Creating translated PPT for {ppt_path.name}...")
        output_filename = f"{ppt_path.stem}_translated{ppt_path.suffix}"
        output_ppt_path = base_dir / output_filename
        create_translated_ppt(str(ppt_path), str(translated_output_path), str(output_ppt_path))

    except Exception as e:
        print(f"Error in process_ppt_file for {ppt_path}: {str(e)}")
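A sketch of driving this file-based pipeline, assuming the ppt_to_xml and translate_xml_file helpers it imports actually exist (they are referenced here, but this commit's xml_handling.py only ships the MongoDB variants):

from pptx_processor import process_ppt_file

# On success, demo_original.xml, demo_translated.xml and demo_translated.pptx
# are written next to the source file.
process_ppt_file("~/decks/demo.pptx", source_lang="Vietnamese", target_lang="English")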
powerpoint/xml_handling.py
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import xml.etree.ElementTree as ET
|
2 |
+
from xml.dom import minidom
|
3 |
+
import json
|
4 |
+
from typing import Dict, List
|
5 |
+
from concurrent.futures import ThreadPoolExecutor
|
6 |
+
from pptx import Presentation
|
7 |
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
8 |
+
from powerpoint.pptx_object import get_table_properties, get_shape_properties
|
9 |
+
from pymongo import MongoClient
|
10 |
+
import gridfs
|
11 |
+
from bson import ObjectId
|
12 |
+
from io import BytesIO
|
13 |
+
|
14 |
+
|
15 |
+
gemini_api = "AIzaSyDtBIjTSfbvuEsobNwjtdyi9gVpDrCaWPM"
|
16 |
+
|
17 |
+
def extract_text_from_group(group_shape, slide_number, shape_index, slide_element):
|
18 |
+
"""Extracts text from shapes within a group, only adding the group if it contains text."""
|
19 |
+
group_element = ET.SubElement(slide_element, "group_element")
|
20 |
+
group_element.set("shape_index", str(shape_index))
|
21 |
+
group_element.set("group_name", group_shape.name) # Add group name
|
22 |
+
|
23 |
+
group_has_text = False # Flag to track if the group contains any text
|
24 |
+
|
25 |
+
for i, shape in enumerate(group_shape.shapes):
|
26 |
+
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
27 |
+
# Recursively check nested groups, and update group_has_text
|
28 |
+
if extract_text_from_group(shape, slide_number, i, group_element):
|
29 |
+
group_has_text = True
|
30 |
+
elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
31 |
+
table_element = ET.SubElement(group_element, "table_element")
|
32 |
+
table_element.set("shape_index", str(i))
|
33 |
+
table_data = get_table_properties(shape.table)
|
34 |
+
props_element = ET.SubElement(table_element, "properties")
|
35 |
+
props_element.text = json.dumps(table_data, indent=2)
|
36 |
+
group_has_text = True
|
37 |
+
elif hasattr(shape, "text_frame") and shape.text_frame:
|
38 |
+
text_element = ET.SubElement(group_element, "text_element")
|
39 |
+
text_element.set("shape_index", str(i))
|
40 |
+
shape_data = get_shape_properties(shape)
|
41 |
+
props_element = ET.SubElement(text_element, "properties")
|
42 |
+
props_element.text = json.dumps(shape_data, indent=2)
|
43 |
+
if shape_data.get("text") or (
|
44 |
+
"paragraphs" in shape_data
|
45 |
+
and any(p.get("text") for p in shape_data["paragraphs"])
|
46 |
+
):
|
47 |
+
group_has_text = True
|
48 |
+
|
49 |
+
# Only keep the group element if it contains text
|
50 |
+
if not group_has_text:
|
51 |
+
slide_element.remove(group_element)
|
52 |
+
return False
|
53 |
+
return True
|
54 |
+
|
55 |
+
def extract_text_from_slide(slide, slide_number, translate=False):
|
56 |
+
"""Extract all text elements from a slide."""
|
57 |
+
slide_element = ET.Element("slide")
|
58 |
+
slide_element.set("number", str(slide_number))
|
59 |
+
|
60 |
+
for shape_index, shape in enumerate(slide.shapes):
|
61 |
+
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
62 |
+
extract_text_from_group(shape, slide_number, shape_index, slide_element)
|
63 |
+
elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
64 |
+
table_element = ET.SubElement(slide_element, "table_element")
|
65 |
+
table_element.set("shape_index", str(shape_index))
|
66 |
+
table_data = get_table_properties(shape.table)
|
67 |
+
props_element = ET.SubElement(table_element, "properties")
|
68 |
+
props_element.text = json.dumps(table_data, indent=2)
|
69 |
+
elif hasattr(shape, "text"):
|
70 |
+
text_element = ET.SubElement(slide_element, "text_element")
|
71 |
+
text_element.set("shape_index", str(shape_index))
|
72 |
+
shape_data = get_shape_properties(shape)
|
73 |
+
props_element = ET.SubElement(text_element, "properties")
|
74 |
+
props_element.text = json.dumps(shape_data, indent=2)
|
75 |
+
return slide_element
|
76 |
+
|
77 |
+
def ppt_to_xml_mongodb(ppt_file_id: str, db_name="ppt"):
|
78 |
+
"""
|
79 |
+
Chuyển PowerPoint từ MongoDB thành XML và lưu vào MongoDB.
|
80 |
+
|
81 |
+
:param ppt_file_id: ID của file PPT gốc trong MongoDB (original_pptx)
|
82 |
+
:param db_name: Tên database MongoDB
|
83 |
+
:return: ID của file XML trong MongoDB (original_xml)
|
84 |
+
"""
|
85 |
+
# Kết nối MongoDB
|
86 |
+
client = MongoClient("mongodb://localhost:27017/")
|
87 |
+
db = client[db_name]
|
88 |
+
|
89 |
+
fs_ppt = gridfs.GridFS(db, collection="root_file") # PPT gốc
|
90 |
+
fs_xml = gridfs.GridFS(db, collection="original_xml") # XML lưu trữ
|
91 |
+
|
92 |
+
try:
|
93 |
+
# Lấy file PPT từ MongoDB
|
94 |
+
if not isinstance(ppt_file_id, ObjectId):
|
95 |
+
ppt_file_id = ObjectId(ppt_file_id)
|
96 |
+
ppt_file = fs_ppt.get(ppt_file_id)
|
97 |
+
prs = Presentation(BytesIO(ppt_file.read()))
|
98 |
+
|
99 |
+
# Tạo XML
|
100 |
+
root = ET.Element("presentation")
|
101 |
+
root.set("file_name", ppt_file.filename)
|
102 |
+
|
103 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
104 |
+
future_to_slide = {
|
105 |
+
executor.submit(extract_text_from_slide, slide, slide_number): slide_number
|
106 |
+
for slide_number, slide in enumerate(prs.slides, 1)
|
107 |
+
}
|
108 |
+
for future in future_to_slide:
|
109 |
+
slide_number = future_to_slide[future]
|
110 |
+
try:
|
111 |
+
slide_element = future.result()
|
112 |
+
root.append(slide_element)
|
113 |
+
except Exception as e:
|
114 |
+
print(f"Error processing slide {slide_number}: {str(e)}")
|
115 |
+
|
116 |
+
xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
|
117 |
+
|
118 |
+
# Lưu XML vào MongoDB
|
119 |
+
xml_output = BytesIO(xml_str.encode("utf-8"))
|
120 |
+
xml_file_id = fs_xml.put(xml_output, filename=f"{ppt_file.filename}.xml")
|
121 |
+
|
122 |
+
print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
|
123 |
+
|
124 |
+
return xml_file_id
|
125 |
+
|
126 |
+
except Exception as e:
|
127 |
+
print(f"❌ Lỗi khi chuyển PPT sang XML: {str(e)}")
|
128 |
+
return None
|
129 |
+
finally:
|
130 |
+
client.close()
|
131 |
+
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
def extract_text_from_xml(file_id=None, filename=None, db_name="ppt", collection_name="original_xml") -> Dict[str, List[str]]:
|
136 |
+
"""
|
137 |
+
Tải XML từ MongoDB và trích xuất văn bản từ các slide.
|
138 |
+
|
139 |
+
:param file_id: ID của file trong MongoDB (dạng ObjectId hoặc string)
|
140 |
+
:param filename: Tên file cần tìm trong MongoDB (VD: "file.xml")
|
141 |
+
:param db_name: Tên database MongoDB
|
142 |
+
:param collection_name: Tên collection GridFS
|
143 |
+
:return: Dictionary {slide_number: [text1, text2, ...]}
|
144 |
+
"""
|
145 |
+
# Kết nối MongoDB
|
146 |
+
client = MongoClient("mongodb://localhost:27017/")
|
147 |
+
db = client[db_name]
|
148 |
+
fs = gridfs.GridFS(db, collection=collection_name)
|
149 |
+
|
150 |
+
try:
|
151 |
+
# Tìm file theo file_id hoặc filename
|
152 |
+
if file_id:
|
153 |
+
if not isinstance(file_id, ObjectId):
|
154 |
+
file_id = ObjectId(file_id)
|
155 |
+
file_data = fs.get(file_id)
|
156 |
+
elif filename:
|
157 |
+
file_data = fs.find_one({"filename": filename})
|
158 |
+
if not file_data:
|
159 |
+
print(f"❌ Không tìm thấy file '{filename}' trong MongoDB!")
|
160 |
+
return {}
|
161 |
+
else:
|
162 |
+
print("❌ Cần cung cấp 'file_id' hoặc 'filename' để tải file.")
|
163 |
+
return {}
|
164 |
+
|
165 |
+
# Đọc nội dung XML từ MongoDB
|
166 |
+
xml_content = file_data.read().decode("utf-8")
|
167 |
+
# print(f"✅ xml_content: {xml_content}")
|
168 |
+
# Chuyển đổi thành cây XML
|
169 |
+
root = ET.fromstring(xml_content)
|
170 |
+
slide_texts = {}
|
171 |
+
|
172 |
+
# Duyệt qua từng slide
|
173 |
+
for slide in root.findall("slide"):
|
174 |
+
slide_number = slide.get("number")
|
175 |
+
texts = []
|
176 |
+
# Helper function to extract text recursively
|
177 |
+
def extract_text_recursive(element):
|
178 |
+
if element.tag == "text_element":
|
179 |
+
props = element.find("properties")
|
180 |
+
if props is not None and props.text:
|
181 |
+
try:
|
182 |
+
shape_data = json.loads(props.text)
|
183 |
+
# Handle both direct 'text' and paragraph-based text
|
184 |
+
if 'text' in shape_data:
|
185 |
+
texts.append(shape_data['text'])
|
186 |
+
elif 'paragraphs' in shape_data:
|
187 |
+
for paragraph in shape_data['paragraphs']:
|
188 |
+
if 'text' in paragraph:
|
189 |
+
texts.append(paragraph['text'])
|
190 |
+
#Also extract run level text
|
191 |
+
elif 'runs' in paragraph:
|
192 |
+
for run in paragraph['runs']:
|
193 |
+
if 'text' in run:
|
194 |
+
texts.append(run['text'])
|
195 |
+
|
196 |
+
|
197 |
+
except json.JSONDecodeError:
|
198 |
+
pass # Ignore if JSON is invalid
|
199 |
+
|
200 |
+
elif element.tag == "table_element":
|
201 |
+
props = element.find("properties")
|
202 |
+
if props is not None and props.text:
|
203 |
+
try:
|
204 |
+
table_data = json.loads(props.text)
|
205 |
+
for row in table_data.get("cells", []):
|
206 |
+
for cell in row:
|
207 |
+
texts.append(cell.get("text", ""))
|
208 |
+
except json.JSONDecodeError:
|
209 |
+
pass # Ignore if JSON is invalid
|
210 |
+
|
211 |
+
# Recursively process children of group_element
|
212 |
+
elif element.tag == "group_element":
|
213 |
+
for child in element:
|
214 |
+
extract_text_recursive(child)
|
215 |
+
|
216 |
+
# Iterate through all direct children of the slide
|
217 |
+
for child in slide:
|
218 |
+
extract_text_recursive(child)
|
219 |
+
|
220 |
+
slide_texts[str(slide_number)] = texts # Ensure slide number is a string
|
221 |
+
print(slide_texts)
|
222 |
+
return slide_texts
|
223 |
+
|
224 |
+
except Exception as e:
|
225 |
+
print(f"❌ Lỗi khi xử lý XML: {e}")
|
226 |
+
return {}
|
227 |
+
finally:
|
228 |
+
client.close()
|
229 |
+
|
230 |
+
|
231 |
+
|
232 |
+
|
233 |
+
|
234 |
+
|
235 |
+
def adjust_size(original_text, translated_text, data_container):
|
236 |
+
"""Adjust font size if translated text is significantly longer."""
|
237 |
+
|
238 |
+
if not original_text or not translated_text:
|
239 |
+
return
|
240 |
+
|
241 |
+
original_len = len(original_text)
|
242 |
+
translated_len = len(translated_text)
|
243 |
+
length_ratio = translated_len / original_len if original_len >0 else 1 # Avoid division by 0
|
244 |
+
|
245 |
+
if length_ratio > 1.5: # Adjust threshold as needed
|
246 |
+
if 'paragraphs' in data_container:
|
247 |
+
for paragraph in data_container['paragraphs']:
|
248 |
+
if 'runs' in paragraph:
|
249 |
+
for run in paragraph['runs']:
|
250 |
+
if run.get('font') and run['font'].get('size'):
|
251 |
+
run['font']['size'] = max(6, int(run['font']['size'] * 0.8))
|
252 |
+
|
253 |
+
elif 'font' in data_container and data_container['font'].get('size'):
|
254 |
+
data_container['font']['size'] = max(6, int(data_container['font']['size'] * 0.8))
|
255 |
+
|
256 |
+
|
257 |
+
|
258 |
+
|
259 |
+
|
260 |
+
def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[str, List[str]], db_name="ppt"):
    """
    Load the XML from MongoDB (original_xml collection), apply the translated content,
    and save the result to the final_xml collection.

    :param file_id: ID of the file in MongoDB (original_xml)
    :param translated_dict: Dictionary {slide_number: [translated_text1, translated_text2, ...]}
    :param db_name: Name of the MongoDB database
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]

    fs_original = gridfs.GridFS(db, collection="original_xml")  # read from original_xml
    fs_final = gridfs.GridFS(db, collection="final_xml")        # write to final_xml

    try:
        # Fetch the file from MongoDB (original_xml)
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)
        file_data = fs_original.get(file_id)
        xml_content = file_data.read().decode("utf-8")

        # Parse the XML string into an element tree
        root = ET.fromstring(xml_content)

        # Apply the translated content slide by slide
        for slide in root.findall("slide"):
            slide_num = slide.get("number")
            if slide_num in translated_dict:
                translated_texts = translated_dict[slide_num]
                text_index = 0  # keep track of the current translated text

                def update_element_recursive(element):
                    nonlocal text_index  # access and modify the outer scope's index

                    if element.tag == "text_element":
                        props = element.find("properties")
                        if props is not None and props.text:
                            try:
                                shape_data = json.loads(props.text)
                                original_text = ""

                                # Handle direct text and paragraph-based text
                                if 'text' in shape_data:
                                    original_text = shape_data['text']
                                    if text_index < len(translated_texts):
                                        shape_data['text'] = translated_texts[text_index]
                                        adjust_size(original_text, translated_texts[text_index], shape_data)
                                        text_index += 1
                                elif 'paragraphs' in shape_data:
                                    for paragraph in shape_data['paragraphs']:
                                        if 'text' in paragraph:
                                            original_text = paragraph['text']
                                            if text_index < len(translated_texts):
                                                paragraph['text'] = translated_texts[text_index]
                                                adjust_size(original_text, translated_texts[text_index], paragraph)
                                                text_index += 1
                                        elif 'runs' in paragraph:
                                            for run in paragraph['runs']:
                                                if 'text' in run:
                                                    original_text = run['text']
                                                    if text_index < len(translated_texts):
                                                        run['text'] = translated_texts[text_index]
                                                        adjust_size(original_text, translated_texts[text_index], run)
                                                        text_index += 1
                                props.text = json.dumps(shape_data, indent=2)
                            except json.JSONDecodeError:
                                print(f"JSONDecodeError in text_element on slide {slide_num}")

                    elif element.tag == "table_element":
                        props = element.find("properties")
                        if props is not None and props.text:
                            try:
                                table_data = json.loads(props.text)
                                for row in table_data.get("cells", []):
                                    for cell in row:
                                        original_text = cell.get('text', '')
                                        if text_index < len(translated_texts):
                                            cell['text'] = translated_texts[text_index]
                                            adjust_size(original_text, translated_texts[text_index], cell)
                                            text_index += 1
                                props.text = json.dumps(table_data, indent=2)
                            except json.JSONDecodeError:
                                print(f"JSONDecodeError in table_element on slide {slide_num}")

                    elif element.tag == "group_element":
                        print("Group element found")
                        for child in element:
                            update_element_recursive(child)  # recursively process children

                # Start the recursive update from the slide's direct children
                for child in slide:
                    update_element_recursive(child)

        # Serialize the XML back to a pretty-printed string
        updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")

        # Save the updated file to MongoDB (final_xml)
        new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}_translated.xml")
        print(f"✅ Updated XML saved to MongoDB (final_xml) with file_id: {new_file_id}")

        return new_file_id

    except Exception as e:
        print(f"❌ Error while updating XML: {e}")
        return None
    finally:
        client.close()

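Taken together with the extraction helper earlier in this file, a minimal round trip could look like the sketch below; extract_texts stands in for whatever the extraction function above is actually named, and the key handling is illustrative.

# Sketch only; identifiers other than update_xml_with_translated_text_mongodb
# and translate_text_dict (see translate/translator.py below) are assumed.
slide_texts = extract_texts(xml_file_id)  # {"1": [...], "2": [...]}
translated = translate_text_dict(slide_texts, "en", "vi",
                                 gemini_api=os.environ["GEMINI_API_KEY"])
final_id = update_xml_with_translated_text_mongodb(xml_file_id, translated)
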
test.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
translate/translator.py
ADDED
@@ -0,0 +1,64 @@
import json
from typing import Dict, List
from google import genai

def translate_text_dict(text_dict: Dict[str, List[str]], source_lang: str, target_lang: str = "vi", gemini_api: str = "") -> Dict[str, List[str]]:
    def translate_batch(batch_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """Translates a single batch of text."""
        prompt = f"""The following Python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}
The text is in {source_lang}, with a chance of there being phrases in other languages as well.

Read through the entire dictionary, then translate the texts into {target_lang} so that the meaning is as close to the intended context as possible.

Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.

Aim for brevity if possible so that the length of the translations matches the length of the original texts, but prioritize accuracy above all.
Return the translated texts formatted like the original dictionary. Do NOT say anything else. Return it as a JSON block."""

        client = genai.Client(api_key=gemini_api)
        response = client.models.generate_content(
            model="gemini-2.0-flash", contents=prompt)  # choose a model appropriate for your needs and quota

        # Handle potential problems in the response, including rate limits and invalid JSON.
        try:
            # More robust JSON parsing: tolerate code fences, markdown, and other wrappers.
            response_text = response.text
            start = response_text.find('{')
            end = response_text.rfind('}') + 1

            if start == -1 or end == 0:  # rfind returns -1, so end == 0 means no closing brace was found
                raise ValueError("Invalid JSON response from Gemini API: no object found.")

            json_string = response_text[start:end]
            trans_dict = json.loads(json_string)
            return trans_dict
        except (ValueError, json.JSONDecodeError) as e:
            print(f"Error processing Gemini API response: {e}")
            print(f"Raw response text: {response.text}")  # print the raw response for debugging
            return {}  # return an empty dict on error (or raise, depending on your needs)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}

    batch_size = 30  # adjust as needed, based on testing and Gemini's context window limits
    translated_dict = {}
    keys = list(text_dict.keys())

    # Process in batches
    for i in range(0, len(keys), batch_size):
        batch_keys = keys[i:i + batch_size]
        batch_dict = {key: text_dict[key] for key in batch_keys}
        translated_batch = translate_batch(batch_dict)

        # Merge results
        if translated_batch:  # only merge if the translation was successful
            translated_dict.update(translated_batch)
        else:
            print(f"Skipping batch {i // batch_size} due to translation error.")

    return translated_dict

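A minimal usage sketch for translate_text_dict; the sample dictionary mirrors the {slide_number: [texts]} shape produced by the PowerPoint extraction step, and the key handling is illustrative.

import os
text_dict = {"1": ["Quarterly report", "Revenue grew 12%"], "2": ["Thank you"]}
translated = translate_text_dict(text_dict, source_lang="en", target_lang="vi",
                                 gemini_api=os.environ["GEMINI_API_KEY"])
# On success the result mirrors the input shape: {"1": [...], "2": [...]}; on error it is {}.
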
word/word_translate.py
ADDED
@@ -0,0 +1,246 @@
import os
import docx
from docx import Document
from google import genai  # Gemini client used for LLM translation
import ast
import json
from docx.oxml import OxmlElement
from copy import deepcopy
import io
from pymongo import MongoClient
from gridfs import GridFS
from deep_translator import GoogleTranslator  # currently unused fallback translator

gemini_api = os.getenv("GEMINI_API_KEY")  # read the key from the environment (.env) instead of hardcoding the secret
target_language = 'vi'
source_language = 'en'

def batch_translate(texts, source_lang='en', target_lang="fr"):
    """Translates multiple text segments in a single API call."""
    if not texts:
        return texts  # skip if empty

    prompt = f"""
Translate the following JSON file from {source_lang} into {target_lang} while preserving names, links, symbols, and formatting:
{json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])}

- The original JSON file contains a Python array of objects, each with "index" and "text" keys.
- Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
- Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
- Return only valid JSON — a Python array of translated objects.
- If the original array is empty, return an empty array.
"""

    client = genai.Client(api_key=gemini_api)
    response = client.models.generate_content(
        model="gemini-2.0-flash", contents=prompt)

    # str.strip() removes the given characters as a set, which peels off ```json fences around the array
    translated_output = ast.literal_eval(response.text.strip().strip("json```").strip("```").strip())

    return [item["text"] for item in translated_output]

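An illustrative call (it issues a real Gemini request, so GEMINI_API_KEY must be set; the sample strings are hypothetical):

segments = ["Hello world", "See figure 3", "https://example.com"]
translated = batch_translate(segments, source_lang="en", target_lang="fr")
assert len(translated) == len(segments)  # the prompt demands one-to-one, index-aligned output
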
def merge_runs(runs):
    """Merges adjacent runs that share the same style and formatting."""
    merged_runs = []
    for run in runs:
        # The checks are collapsed into one condition so that the first item, hyperlinks,
        # and style-mismatched runs all fall through to append() instead of being dropped.
        if (
            merged_runs and
            isinstance(run, docx.text.run.Run) and
            isinstance(merged_runs[-1], docx.text.run.Run) and
            run.style == merged_runs[-1].style and
            merged_runs[-1].bold == run.bold and
            merged_runs[-1].italic == run.italic and
            merged_runs[-1].underline == run.underline and
            merged_runs[-1].font.size == run.font.size and
            merged_runs[-1].font.color.rgb == run.font.color.rgb and
            merged_runs[-1].font.name == run.font.name
        ):
            merged_runs[-1].text += run.text
        else:
            merged_runs.append(run)
    return merged_runs

NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

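Merging runs before translation matters because Word frequently splits visually identical text into many short runs (after spell-check, tracked changes, or piecemeal edits); translating each fragment in isolation destroys sentence context, so adjacent runs with identical formatting are first coalesced into a single segment.
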
def translate_paragraphs(doc, source_lang, target_lang):
    paragraphs = []
    for para in doc.paragraphs:
        for run in merge_runs(para.iter_inner_content()):
            if isinstance(run, docx.text.run.Run):
                paragraphs.append(run.text)

    # Translate in batches of at most 5000 characters per request
    translated_paragraphs = []
    temp_batch = []
    chars = 0
    for para in paragraphs:
        if len(para) + chars > 5000:
            translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
            temp_batch = []
            chars = 0
        chars += len(para)
        temp_batch.append(para)
    translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)

    if len(translated_paragraphs) > 0:
        # Write the translated text back into the document
        para_index = 0
        for para in doc.paragraphs:
            original_para = deepcopy(para)
            para.clear()  # remove text while keeping paragraph properties
            for run in merge_runs(original_para.iter_inner_content()):
                if isinstance(run, docx.text.run.Run):
                    translated_text = translated_paragraphs[para_index]
                    try:
                        translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # drop invalid characters
                    except UnicodeEncodeError:
                        translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # replace invalid characters
                    drawing = run._element.find(f".//{NS_W}drawing")
                    pict = run._element.find(f".//{NS_W}pict")  # f-string so the namespace is actually substituted

                    # Create a new run with the translated text and copy the formatting
                    new_run = para.add_run(translated_text)
                    new_run.style = run.style

                    if drawing is not None:
                        new_run._element.append(drawing)
                    elif pict is not None:
                        new_run._element.append(pict)

                    # Copy formatting from the original run
                    new_run.bold = run.bold
                    new_run.italic = run.italic
                    new_run.underline = run.underline
                    new_run.font.size = run.font.size
                    new_run.font.color.rgb = run.font.color.rgb
                    new_run.font.name = run.font.name
                    para_index += 1
                elif isinstance(run, docx.text.hyperlink.Hyperlink):
                    parent = run._element
                    tag = parent.tag.split("}")[-1]

                    # Re-create the hyperlink element with the correct namespace
                    new_hyperlink = OxmlElement(f"w:{tag}")
                    for attr in parent.attrib:
                        new_hyperlink.set(attr, parent.get(attr))
                    for child in parent:
                        new_hyperlink.append(child)
                    para._element.append(new_hyperlink)

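The 5000-character cap keeps each request well inside the model's context window while amortizing per-call overhead: a 60,000-character document becomes roughly a dozen sequential API calls instead of one call per run. Note that the cap is checked before appending, so a batch only exceeds 5000 characters when a single run is itself that long.
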
def translate_tables(doc, source_lang, target_lang):
    table_texts = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    for run in merge_runs(para.iter_inner_content()):
                        if isinstance(run, docx.text.run.Run):
                            table_texts.append(run.text)

    # Translate in batches of at most 5000 characters per request
    translated_tables = []
    temp_batch = []
    chars = 0
    for para in table_texts:
        if len(para) + chars > 5000:
            translated_tables += batch_translate(temp_batch, source_lang, target_lang)
            temp_batch = []
            chars = 0
        chars += len(para)
        temp_batch.append(para)
    translated_tables += batch_translate(temp_batch, source_lang, target_lang)

    if len(translated_tables) > 0:
        table_index = 0
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for para in cell.paragraphs:
                        original_para = deepcopy(para)
                        para.clear()  # remove text while keeping paragraph properties
                        for run in merge_runs(original_para.iter_inner_content()):
                            if isinstance(run, docx.text.run.Run):
                                translated_text = translated_tables[table_index]
                                try:
                                    translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # drop invalid characters
                                except UnicodeEncodeError:
                                    translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # replace invalid characters
                                drawing = run._element.find(f".//{NS_W}drawing")
                                pict = run._element.find(f".//{NS_W}pict")  # f-string so the namespace is actually substituted

                                # Create a new run with the translated text and copy the formatting
                                new_run = para.add_run(translated_text)
                                new_run.style = run.style

                                if drawing is not None:
                                    new_run._element.append(drawing)
                                elif pict is not None:
                                    new_run._element.append(pict)

                                # Copy formatting from the original run
                                new_run.bold = run.bold
                                new_run.italic = run.italic
                                new_run.underline = run.underline
                                new_run.font.size = run.font.size
                                new_run.font.color.rgb = run.font.color.rgb
                                new_run.font.name = run.font.name
                                table_index += 1
                            elif isinstance(run, docx.text.hyperlink.Hyperlink):
                                parent = run._element
                                tag = parent.tag.split("}")[-1]

                                # Re-create the hyperlink element with the correct namespace
                                new_hyperlink = OxmlElement(f"w:{tag}")
                                for attr in parent.attrib:
                                    new_hyperlink.set(attr, parent.get(attr))
                                for child in parent:
                                    new_hyperlink.append(child)
                                para._element.append(new_hyperlink)

def translate_header_footer(doc, source_lang, target_lang):
    head_foot = []
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                head_foot.append(run.text)
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                head_foot.append(run.text)
    translated_head_foot = batch_translate(head_foot, source_lang, target_lang)

    i = 0
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                run.text = translated_head_foot[i]
                i += 1
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                run.text = translated_head_foot[i]
                i += 1

def translate_docx(file_id, source_lang='en', target_lang='fr', db_name='word'):
    client = MongoClient('mongodb://localhost:27017/')
    db = client[db_name]
    fs_input = GridFS(db, collection="root_file")
    fs_output = GridFS(db, collection="final_file")

    file_data = fs_input.get(file_id).read()
    input_doc = Document(io.BytesIO(file_data))

    translate_paragraphs(input_doc, source_lang, target_lang)
    translate_tables(input_doc, source_lang, target_lang)
    translate_header_footer(input_doc, source_lang, target_lang)

    output_stream = io.BytesIO()
    input_doc.save(output_stream)
    output_stream.seek(0)

    translated_file_id = fs_output.put(output_stream, filename=f"{target_lang}_translated.docx")
    print(f"Translation complete! Saved with file ID: {translated_file_id}")

    return translated_file_id
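End to end, the Word pipeline can be driven as in the sketch below; the ObjectId is whatever ID the upload step returned when the .docx was stored in the word/root_file GridFS bucket (illustrative, not part of the commit):

from word.word_translate import translate_docx

new_id = translate_docx(uploaded_file_id, source_lang="en", target_lang="vi", db_name="word")
# The translated document is now stored in the word/final_file GridFS bucket under new_id.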