Spaces:
Running
Running
update xlsx
Browse files- excel/__pycache__/xlsx.cpython-310.pyc +0 -0
- excel/xlsx.py +430 -0
- pages/upload.py +5 -10
- powerpoint/__pycache__/pptx.cpython-310.pyc +0 -0
- powerpoint/pptx.py +0 -2
- powerpoint/pptx_object.py +0 -354
- powerpoint/pptx_processor.py +0 -50
excel/__pycache__/xlsx.cpython-310.pyc
ADDED
Binary file (11.3 kB). View file
|
|
excel/xlsx.py
ADDED
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import zipfile
|
3 |
+
import copy
|
4 |
+
import time
|
5 |
+
import xml.etree.ElementTree as ET
|
6 |
+
from typing import List, Dict, Any, Optional, Tuple
|
7 |
+
from utils.utils import translate_text, unzip_office_file, preprocess_text, postprocess_text, translate_single_text
|
8 |
+
from pymongo import MongoClient
|
9 |
+
import gridfs
|
10 |
+
from io import BytesIO
|
11 |
+
import shutil
|
12 |
+
import io
|
13 |
+
|
14 |
+
# Prefix map for the SpreadsheetML (xlsx) main namespace, used by all XPath lookups below.
NS_MAIN = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
|
15 |
+
|
16 |
+
# --- Namespace registration (important when writing files back) ---
def register_namespaces(xml_file):
    """Register every namespace declared in *xml_file* with ElementTree.

    Registering the original prefixes before serialization keeps them in the
    re-written XML instead of ElementTree's auto-generated ``ns0``-style ones.
    Additionally makes sure the common SpreadsheetML namespace is registered.
    """
    declared = {prefix: uri
                for _, (prefix, uri) in ET.iterparse(xml_file, events=['start-ns'])}
    for prefix, uri in declared.items():
        ET.register_namespace(prefix, uri)
    # Ensure the SpreadsheetML namespace has some registration.
    if 'main' not in declared and '' not in declared:
        # No 'main' prefix and no default namespace: register it as the default.
        ET.register_namespace('', NS_MAIN['main'])
    elif 'main' not in declared:
        # A default namespace exists but no 'main' prefix yet.
        ET.register_namespace('main', NS_MAIN['main'])
|
29 |
+
|
30 |
+
|
31 |
+
def extract_text_from_sheet(unzipped_folder_path: str) -> Optional[Tuple[List[Dict[str, Any]], Dict[str, Any]]]:
    """
    Extract all translatable text from an unzipped .xlsx folder.

    Walks xl/sharedStrings.xml and every xl/worksheets/*.xml, collecting one
    descriptor dict per text entry. For rich-text shared strings the <rPr> of
    the FIRST run is deep-copied so apply_and_save_changes() can later rebuild
    the entry with that formatting applied to the translated text.

    Returns:
        (modifiable_nodes, global_data) — each node dict holds the original
        text, a live Element reference into the parsed tree, the saved
        first-run format (or None), and source-file bookkeeping; global_data
        carries the parsed trees and paths needed to save changes afterwards.
    """
    modifiable_nodes = []
    shared_strings_path = os.path.join(unzipped_folder_path, "xl", "sharedStrings.xml")
    worksheets_folder = os.path.join(unzipped_folder_path, "xl", "worksheets")
    shared_tree = None  # parsed sharedStrings tree, kept so it can be saved later
    sheet_trees = {}    # sheet filename -> parsed ElementTree, kept for saving

    # --- Process sharedStrings.xml ---
    if os.path.exists(shared_strings_path):
        try:
            register_namespaces(shared_strings_path)
            shared_tree = ET.parse(shared_strings_path)
            root_shared = shared_tree.getroot()

            for si_element in root_shared.findall('main:si', NS_MAIN):
                text_parts = []
                t_elements = si_element.findall('.//main:t', NS_MAIN)  # all descendant <t> nodes

                # Find the first run (<r>) and its properties (<rPr>)
                first_r = si_element.find('./main:r', NS_MAIN)  # first direct <r> child
                first_rpr_clone = None  # will hold a copy of the first <rPr>
                is_rich_text = first_r is not None

                if is_rich_text:
                    # Look for <rPr> inside the first <r>
                    first_rpr = first_r.find('./main:rPr', NS_MAIN)
                    if first_rpr is not None:
                        # Deep copy so the original tree is untouched and the
                        # clone survives the rebuild in apply_and_save_changes
                        first_rpr_clone = copy.deepcopy(first_rpr)

                # Concatenate every text fragment of this entry
                for t_node in t_elements:
                    if t_node.text:
                        text_parts.append(t_node.text)
                full_text = "".join(text_parts)

                if not full_text: continue  # skip entries with no text

                if is_rich_text:
                    modifiable_nodes.append({
                        'type': 'shared_rich',
                        'original_text': full_text,
                        'element': si_element,  # reference to the <si> element
                        'first_format': first_rpr_clone,  # first-run <rPr> format (or None)
                        'source_file': os.path.join("xl", "sharedStrings.xml"),
                        'sheet_name': None
                    })
                elif t_elements:  # not rich text: use the simple <t> tag
                    first_t = si_element.find('./main:t', NS_MAIN)
                    if first_t is not None:
                        modifiable_nodes.append({
                            'type': 'shared_simple',
                            'original_text': full_text,
                            'element': first_t,  # reference to the <t> element
                            'first_format': None,  # no special formatting
                            'source_file': os.path.join("xl", "sharedStrings.xml"),
                            'sheet_name': None
                        })

        except Exception as e:
            print(f"Lỗi xử lý sharedStrings: {e}")
            import traceback
            traceback.print_exc()

    # --- Process sheetX.xml files (inline strings — no complex formatting) ---
    if os.path.isdir(worksheets_folder):
        for sheet_filename in sorted(os.listdir(worksheets_folder)):
            if sheet_filename.lower().endswith(".xml"):
                sheet_file_path = os.path.join(worksheets_folder, sheet_filename)
                try:
                    register_namespaces(sheet_file_path)
                    sheet_tree = ET.parse(sheet_file_path)
                    sheet_trees[sheet_filename] = sheet_tree
                    root_sheet = sheet_tree.getroot()
                    # Cells with t="inlineStr" keep their text directly in <is><t>
                    for cell in root_sheet.findall('.//main:c[@t="inlineStr"]', NS_MAIN):
                        t_element = cell.find('.//main:is/main:t', NS_MAIN)
                        if t_element is not None and t_element.text is not None:
                            modifiable_nodes.append({
                                'type': 'inline',
                                'original_text': t_element.text,
                                'element': t_element,  # reference to the <t> element
                                'first_format': None,  # inline strings carry no <rPr>
                                'source_file': os.path.join("xl", "worksheets", sheet_filename),
                                'sheet_name': sheet_filename
                            })
                except Exception as e:
                    print(f"Lỗi xử lý sheet {sheet_filename}: {e}")
                    import traceback
                    traceback.print_exc()

    else:
        print(f"Lỗi: Không tìm thấy thư mục worksheets: {worksheets_folder}")


    global_data = {"shared_tree": shared_tree, "sheet_trees": sheet_trees, "shared_strings_path": shared_strings_path, "worksheets_folder": worksheets_folder}
    return modifiable_nodes, global_data
|
131 |
+
|
132 |
+
def apply_and_save_changes(modified_nodes_data: List[Dict[str, Any]], global_data: Dict[str, Any]) -> bool:
    """
    Write translated text back into the parsed XML trees and save them.

    Rich-text <si> entries are rebuilt as a single run that reuses the saved
    first-run <rPr>; simple shared strings and inline strings have their <t>
    element updated in place. Only files that actually changed are written.

    Returns:
        True when every touched XML file was saved successfully (or when
        nothing needed updating), False otherwise.
    """
    if not global_data:
        print("Lỗi: Thiếu global_data.")
        return False

    xml_space = '{http://www.w3.org/XML/1998/namespace}space'
    try:
        ET.register_namespace('xml', "http://www.w3.org/XML/1998/namespace")
    except ValueError:
        pass

    touched = set()
    for info in modified_nodes_data:
        if 'modified_text' not in info or info['element'] is None:
            continue
        target = info['element']
        new_text = info['modified_text']
        if info.get('original_text', '') == new_text:
            continue  # nothing changed for this node

        kind = info.get('type', '')
        if kind == 'shared_rich':
            # Rebuild <si> as a single <r>[<rPr>]<t> run carrying the saved format.
            for old_child in list(target):
                target.remove(old_child)
            run = ET.Element(f"{{{NS_MAIN['main']}}}r")
            saved_rpr = info.get('first_format')
            if saved_rpr is not None:
                run.append(saved_rpr)  # re-attach the cloned first-run <rPr>
            text_el = ET.Element(f"{{{NS_MAIN['main']}}}t")
            text_el.text = new_text
            text_el.set(xml_space, 'preserve')  # keep leading/trailing spaces
            run.append(text_el)
            target.append(run)
            touched.add(info['source_file'])
        elif kind in ('shared_simple', 'inline'):
            # Plain text node: just swap the text and preserve whitespace.
            target.text = new_text
            if target.attrib.get(xml_space) != 'preserve':
                target.set(xml_space, 'preserve')
            touched.add(info['source_file'])
        else:
            print(f"Cảnh báo: Loại node không xác định '{kind}'")

    # --- Persist only the XML files that were modified ---
    ok = True
    shared_tree = global_data.get("shared_tree")
    shared_path = global_data.get("shared_strings_path")
    if shared_tree and shared_path and os.path.join("xl", "sharedStrings.xml") in touched:
        try:
            shared_tree.write(shared_path, encoding='utf-8', xml_declaration=True)
        except Exception as e:
            print(f"Lỗi lưu {shared_path}: {e}")
            ok = False

    sheets_dir = global_data.get("worksheets_folder")
    if sheets_dir and os.path.exists(sheets_dir):
        for fname, tree in global_data.get("sheet_trees", {}).items():
            if os.path.join("xl", "worksheets", fname) not in touched:
                continue
            out_path = os.path.join(sheets_dir, fname)
            try:
                tree.write(out_path, encoding='utf-8', xml_declaration=True)
            except Exception as e:
                print(f"Lỗi lưu {out_path}: {e}")
                ok = False

    if ok and touched:
        print(f"Đã lưu thành công {len(touched)} file XML đã sửa đổi (đã giữ lại định dạng đầu tiên cho Rich Text).")
    elif not touched:
        print("Không có file XML nào cần cập nhật.")
        return True
    return ok
|
217 |
+
|
218 |
+
def zip_folder_to_excel_file(folder_path, file_name):
    """
    Zip an unpacked OOXML folder back into an .xlsx (in memory) and store it
    in MongoDB GridFS (database 'excel', bucket 'final_file').

    Args:
        folder_path: Root of the extracted xlsx directory tree.
        file_name: Filename to record on the stored GridFS document.

    Returns:
        The GridFS file id on success, or None on any failure.
    """
    client = None
    try:
        # Compress the folder into an in-memory .xlsx (zip) archive.
        xlsx_buffer = io.BytesIO()
        with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Store paths relative to the folder root so the archive
                    # layout matches a real .xlsx package.
                    archive_path = os.path.relpath(file_path, folder_path)
                    zipf.write(file_path, archive_path)

        # SECURITY: credentials are hard-coded in the connection string —
        # they should be moved to an environment variable / secret store.
        client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
        db = client['excel']
        fs = gridfs.GridFS(db, collection='final_file')

        # getvalue() returns the full buffer without needing seek(0)+read().
        file_id = fs.put(xlsx_buffer.getvalue(), filename=file_name)
        print(f"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}")
        return file_id

    except Exception as e:
        print(f"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}")
        return None
    finally:
        # BUGFIX: the MongoClient was previously never closed (connection leak).
        if client is not None:
            client.close()
|
242 |
+
|
243 |
+
|
244 |
+
def get_text_list_from_nodes(modifiable_nodes: Optional[List[Dict[str, Any]]]) -> List[str]:
    """Collect the 'original_text' value from every node descriptor.

    Entries missing the key or holding None are skipped; a None input yields
    an empty list.
    """
    if modifiable_nodes is None:
        return []  # nothing to extract from

    texts: List[str] = []
    for entry in modifiable_nodes:
        value = entry.get('original_text')
        if value is not None:
            texts.append(value)
    return texts
|
257 |
+
|
258 |
+
|
259 |
+
def count_words(text: str) -> int:
    """Return the number of whitespace-separated tokens in *text* (0 for empty/blank)."""
    return 0 if not text or text.isspace() else len(text.split())
|
264 |
+
|
265 |
+
# Helper function to process a batch of valid segments (Unchanged)
|
266 |
+
def _translate_batch_helper(segments_to_translate, original_indices_1based, source_lang, target_lang):
|
267 |
+
"""Handles preprocessing, translation, postprocessing, and error handling for a batch."""
|
268 |
+
batch_results = [None] * len(segments_to_translate)
|
269 |
+
|
270 |
+
if not segments_to_translate:
|
271 |
+
return []
|
272 |
+
|
273 |
+
try:
|
274 |
+
processed_segments = preprocess_text(segments_to_translate)
|
275 |
+
translated_segments = translate_text(processed_segments, source_lang, target_lang)
|
276 |
+
final_translated_segments = postprocess_text(translated_segments)
|
277 |
+
|
278 |
+
if len(final_translated_segments) == len(segments_to_translate):
|
279 |
+
batch_results = final_translated_segments
|
280 |
+
else:
|
281 |
+
print(f" *** CRITICAL ERROR: Batch translation result count mismatch! Expected {len(segments_to_translate)}, got {len(final_translated_segments)}. Marking batch as failed.")
|
282 |
+
error_msg = "<translation_length_mismatch_error>"
|
283 |
+
batch_results = [error_msg] * len(segments_to_translate)
|
284 |
+
|
285 |
+
except Exception as e:
|
286 |
+
print(f" *** ERROR during batch translation: {e}. Marking batch as failed.")
|
287 |
+
# traceback.print_exc() # Uncomment for detailed debug
|
288 |
+
error_msg = "<translation_api_error>"
|
289 |
+
batch_results = [error_msg] * len(segments_to_translate)
|
290 |
+
|
291 |
+
return batch_results
|
292 |
+
|
293 |
+
|
294 |
+
def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):
    """
    Translate an XLSX stored in GridFS, batching segments dynamically and
    translating over-long segments individually.

    Args:
        file_id: GridFS id of the source workbook in the 'root_file' bucket.
        file_name: Filename to record on the translated output file.
        source_lang (str): Source language code.
        target_lang (str): Target language code.
        batch_size_segments (int): Desired maximum number of text segments per API call.
        max_words_per_segment (int): Word limit for a segment to be batch-translated;
            longer segments are translated one by one.
        delay_between_requests (int): Seconds to wait between translation API calls.

    Returns:
        The GridFS id of the translated .xlsx, or None if any stage failed.
    """
    # BUGFIX: final_id must always be bound — previously `return final_id`
    # raised UnboundLocalError when the length-mismatch or save-failure branch ran.
    final_id = None

    # SECURITY: credentials are hard-coded in the connection string —
    # they should be moved to an environment variable / secret store.
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    try:
        db = client['excel']
        fs = gridfs.GridFS(db, collection='root_file')
        excel_file = BytesIO(fs.get(file_id).read())
    finally:
        client.close()  # BUGFIX: connection was previously never closed

    xml_folder = unzip_office_file(excel_file)

    modifiable_nodes, global_data = extract_text_from_sheet(xml_folder)
    original_texts = get_text_list_from_nodes(modifiable_nodes)

    all_results = [None] * len(original_texts)  # one result slot per segment
    current_index = 0
    processed_count = 0
    api_call_counter = 0  # track API calls for the delay logic

    while current_index < len(original_texts):
        batch_texts_to_translate = []
        batch_original_indices = []  # 0-based indices for result assignment
        batch_end_index = min(current_index + batch_size_segments, len(original_texts))
        found_long_segment_at = -1  # 0-based index of an over-long segment

        # 1. Build the next potential batch, stopping at the first over-long segment.
        for i in range(current_index, batch_end_index):
            segment = original_texts[i]
            if count_words(segment) <= max_words_per_segment:
                batch_texts_to_translate.append(segment)
                batch_original_indices.append(i)
            else:
                found_long_segment_at = i
                break  # stop building this batch

        # 2. Translate the VALID batch collected before the long segment (if any).
        if batch_texts_to_translate:
            # Throttle: wait before every API call except the very first one.
            if api_call_counter > 0 and delay_between_requests > 0:
                time.sleep(delay_between_requests)

            translated_batch = _translate_batch_helper(
                batch_texts_to_translate,
                [idx + 1 for idx in batch_original_indices],  # 1-based for logging
                source_lang,
                target_lang
            )
            api_call_counter += 1
            for batch_idx, original_idx in enumerate(batch_original_indices):
                all_results[original_idx] = translated_batch[batch_idx]
            processed_count += len(batch_texts_to_translate)

        # 3. Translate the over-long segment INDIVIDUALLY (if one was found).
        if found_long_segment_at != -1:
            long_segment_index = found_long_segment_at
            long_segment_text = str(original_texts[long_segment_index])
            try:
                translated = translate_single_text(long_segment_text, source_lang, target_lang)
                api_call_counter += 1
                all_results[long_segment_index] = translated
            except Exception as e:
                print(f" *** ERROR during translation of long segment {long_segment_index + 1}: {e}. Marking as failed.")
                all_results[long_segment_index] = "<translation_api_error>"
                # api_call_counter intentionally not incremented on failure

            processed_count += 1
            # Resume AFTER this long segment.
            current_index = long_segment_index + 1
        else:
            # No long segment in the range examined: move past the whole range.
            current_index = batch_end_index

    # Safety net: any slot still None was skipped by the loop logic — keep the
    # original text so output alignment never breaks.
    missing_count = 0
    final_texts_for_nodes = []
    for i, res in enumerate(all_results):
        if res is None:
            print(f"LỖI LOGIC: Segment {i+1} không được xử lý! Giữ lại text gốc: '{original_texts[i]}'")
            final_texts_for_nodes.append(original_texts[i])
            missing_count += 1
        else:
            final_texts_for_nodes.append(res)

    if missing_count > 0:
        print(f"CẢNH BÁO NGHIÊM TRỌNG: {missing_count} segments bị bỏ lỡ trong quá trình xử lý.")

    if len(final_texts_for_nodes) != len(original_texts):
        print(f"LỖI NGHIÊM TRỌNG: Số lượng text cuối cùng ({len(final_texts_for_nodes)}) không khớp với gốc ({len(original_texts)}). Hủy bỏ cập nhật.")
    else:
        # Attach the translated text to each node descriptor.
        for i, node_info in enumerate(modifiable_nodes):
            node_info['modified_text'] = final_texts_for_nodes[i]

        save_success = apply_and_save_changes(modifiable_nodes, global_data)
        if not save_success:
            print("LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.")
        else:
            # Only zip and upload when the XML was saved successfully.
            final_id = zip_folder_to_excel_file(xml_folder, file_name)
            if final_id:
                shutil.rmtree(xml_folder)  # clean up the extracted folder
            else:
                print("LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.")
    return final_id
|
428 |
+
|
429 |
+
|
430 |
+
|
pages/upload.py
CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
|
|
2 |
import google.generativeai as genai
|
3 |
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
|
4 |
from powerpoint.pptx import translate_pptx
|
5 |
-
from excel.
|
6 |
from word.word_translate import translate_docx_from_mongodb
|
7 |
import dotenv
|
8 |
import os
|
@@ -26,16 +26,11 @@ def process_file(file, file_type):
|
|
26 |
|
27 |
if file_type == "PPTX":
|
28 |
final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
|
29 |
-
|
30 |
-
# text_dict = extract_text_from_xml(file_id=xml_file_id)
|
31 |
-
# translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
32 |
-
# progress_bar.progress(60)
|
33 |
-
# final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
|
34 |
-
# final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
|
35 |
elif file_type == "Excel":
|
36 |
-
final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
|
37 |
-
elif file_type == "CSV":
|
38 |
-
|
39 |
elif file_type == "Word":
|
40 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
41 |
else:
|
|
|
2 |
import google.generativeai as genai
|
3 |
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
|
4 |
from powerpoint.pptx import translate_pptx
|
5 |
+
from excel.xlsx import translate_xlsx
|
6 |
from word.word_translate import translate_docx_from_mongodb
|
7 |
import dotenv
|
8 |
import os
|
|
|
26 |
|
27 |
if file_type == "PPTX":
|
28 |
final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
|
29 |
+
progress_bar.progress(60)
|
|
|
|
|
|
|
|
|
|
|
30 |
elif file_type == "Excel":
|
31 |
+
final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
|
32 |
+
# elif file_type == "CSV":
|
33 |
+
# final_id = translate_csv(file_id = file_id, target_lang = target_lang)
|
34 |
elif file_type == "Word":
|
35 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
36 |
else:
|
powerpoint/__pycache__/pptx.cpython-310.pyc
CHANGED
Binary files a/powerpoint/__pycache__/pptx.cpython-310.pyc and b/powerpoint/__pycache__/pptx.cpython-310.pyc differ
|
|
powerpoint/pptx.py
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
import os
|
2 |
import zipfile
|
3 |
import shutil
|
4 |
-
from pptx import Presentation
|
5 |
from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
|
6 |
from powerpoint.xml_handling import *
|
7 |
from pymongo import MongoClient
|
8 |
import gridfs
|
9 |
-
from bson import ObjectId
|
10 |
from io import BytesIO
|
11 |
|
12 |
def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
|
|
|
1 |
import os
|
2 |
import zipfile
|
3 |
import shutil
|
|
|
4 |
from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
|
5 |
from powerpoint.xml_handling import *
|
6 |
from pymongo import MongoClient
|
7 |
import gridfs
|
|
|
8 |
from io import BytesIO
|
9 |
|
10 |
def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
|
powerpoint/pptx_object.py
DELETED
@@ -1,354 +0,0 @@
|
|
1 |
-
# ppt_objects.py
|
2 |
-
from pptx import Presentation
|
3 |
-
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
|
4 |
-
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
5 |
-
import xml.etree.ElementTree as ET
|
6 |
-
from pptx.util import Pt
|
7 |
-
from pptx.dml.color import RGBColor
|
8 |
-
import re
|
9 |
-
import json
|
10 |
-
|
11 |
-
from pymongo import MongoClient
|
12 |
-
from gridfs import GridFS
|
13 |
-
import json
|
14 |
-
import xml.etree.ElementTree as ET
|
15 |
-
from io import BytesIO
|
16 |
-
|
17 |
-
|
18 |
-
def apply_group_properties_recursive(shape, shape_index, parent_element):
|
19 |
-
"""Recursively applies properties to shapes within groups."""
|
20 |
-
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
21 |
-
group_element = parent_element.find(f".//group_element[@shape_index='{shape_index}']")
|
22 |
-
if group_element is not None:
|
23 |
-
for i, sub_shape in enumerate(shape.shapes):
|
24 |
-
apply_group_properties_recursive(sub_shape, i, group_element)
|
25 |
-
|
26 |
-
# Apply properties for sub-shapes WITHIN the group, based on their type.
|
27 |
-
if sub_shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
28 |
-
table_element = group_element.find(f".//table_element[@shape_index='{i}']")
|
29 |
-
if table_element: # Use a shorter name for clarity
|
30 |
-
props_element = table_element.find("properties")
|
31 |
-
if props_element is not None and props_element.text:
|
32 |
-
try:
|
33 |
-
table_data = json.loads(props_element.text)
|
34 |
-
apply_table_properties(sub_shape.table, table_data)
|
35 |
-
except (json.JSONDecodeError, KeyError) as e:
|
36 |
-
print(f"Error applying table properties (in group): {str(e)}")
|
37 |
-
|
38 |
-
elif hasattr(sub_shape, "text_frame") and sub_shape.text_frame:
|
39 |
-
text_element = group_element.find(f".//text_element[@shape_index='{i}']")
|
40 |
-
if text_element: # Shorter name
|
41 |
-
props_element = text_element.find("properties")
|
42 |
-
if props_element is not None and props_element.text:
|
43 |
-
try:
|
44 |
-
shape_data = json.loads(props_element.text)
|
45 |
-
apply_shape_properties(sub_shape, shape_data)
|
46 |
-
except (json.JSONDecodeError, KeyError) as e:
|
47 |
-
print(f"Error applying shape properties (in group): {str(e)}")
|
48 |
-
|
49 |
-
def get_alignment_value(alignment_str):
|
50 |
-
"""Convert alignment string (with extra characters) to PP_ALIGN enum value."""
|
51 |
-
alignment_map = {
|
52 |
-
'center': PP_ALIGN.CENTER,
|
53 |
-
'left': PP_ALIGN.LEFT,
|
54 |
-
'right': PP_ALIGN.RIGHT,
|
55 |
-
'justify': PP_ALIGN.JUSTIFY
|
56 |
-
}
|
57 |
-
match = re.match(r"([A-Za-z]+)", alignment_str)
|
58 |
-
return alignment_map.get(match.group(1).lower()) if match else None
|
59 |
-
|
60 |
-
def get_vertical_anchor(value):
|
61 |
-
"""Converts vertical_anchor string to MSO_ANCHOR enum."""
|
62 |
-
mapping = {
|
63 |
-
"TOP": MSO_ANCHOR.TOP,
|
64 |
-
"MIDDLE": MSO_ANCHOR.MIDDLE,
|
65 |
-
"BOTTOM": MSO_ANCHOR.BOTTOM
|
66 |
-
}
|
67 |
-
return mapping.get(value.upper().split()[0], MSO_ANCHOR.TOP)
|
68 |
-
|
69 |
-
def get_table_properties(table):
|
70 |
-
"""Extract complete table properties."""
|
71 |
-
table_data = {
|
72 |
-
'rows': len(table.rows),
|
73 |
-
'cols': len(table.columns),
|
74 |
-
'cells': []
|
75 |
-
}
|
76 |
-
for row in table.rows:
|
77 |
-
row_data = []
|
78 |
-
for cell in row.cells:
|
79 |
-
cell_data = {
|
80 |
-
'text': cell.text.strip(),
|
81 |
-
'font_size': None,
|
82 |
-
'font_name': None,
|
83 |
-
'alignment': None,
|
84 |
-
'margin_left': cell.margin_left,
|
85 |
-
'margin_right': cell.margin_right,
|
86 |
-
'margin_top': cell.margin_top,
|
87 |
-
'margin_bottom': cell.margin_bottom,
|
88 |
-
'vertical_anchor': str(cell.vertical_anchor) if cell.vertical_anchor else None,
|
89 |
-
'font_color': None
|
90 |
-
}
|
91 |
-
if cell.text_frame.paragraphs:
|
92 |
-
paragraph = cell.text_frame.paragraphs[0]
|
93 |
-
if paragraph.runs:
|
94 |
-
run = paragraph.runs[0]
|
95 |
-
if hasattr(run.font, 'size') and run.font.size is not None:
|
96 |
-
cell_data['font_size'] = run.font.size.pt
|
97 |
-
if hasattr(run.font, 'name'):
|
98 |
-
cell_data['font_name'] = run.font.name
|
99 |
-
if hasattr(run.font, 'bold'):
|
100 |
-
cell_data['bold'] = run.font.bold
|
101 |
-
if hasattr(run.font, 'italic'):
|
102 |
-
cell_data['italic'] = run.font.italic
|
103 |
-
if (hasattr(run.font, 'color') and
|
104 |
-
run.font.color is not None and
|
105 |
-
hasattr(run.font.color, 'rgb') and
|
106 |
-
run.font.color.rgb is not None):
|
107 |
-
cell_data['font_color'] = str(run.font.color.rgb)
|
108 |
-
if hasattr(paragraph, 'alignment'):
|
109 |
-
cell_data['alignment'] = f"{paragraph.alignment}" if paragraph.alignment else None
|
110 |
-
row_data.append(cell_data)
|
111 |
-
table_data['cells'].append(row_data)
|
112 |
-
return table_data
|
113 |
-
|
114 |
-
def get_shape_properties(shape):
|
115 |
-
"""Extract all properties from a shape, with detailed debug prints."""
|
116 |
-
shape_data = {
|
117 |
-
'text': '',
|
118 |
-
'font_size': None,
|
119 |
-
'font_name': None,
|
120 |
-
'alignment': None,
|
121 |
-
'width': shape.width,
|
122 |
-
'height': shape.height,
|
123 |
-
'left': shape.left,
|
124 |
-
'top': shape.top,
|
125 |
-
'bold': None,
|
126 |
-
'italic': None,
|
127 |
-
'line_spacing_info': {
|
128 |
-
'rule': None,
|
129 |
-
'value': None
|
130 |
-
},
|
131 |
-
'space_before': None,
|
132 |
-
'space_after': None,
|
133 |
-
'font_color': None
|
134 |
-
}
|
135 |
-
|
136 |
-
if hasattr(shape, "text"):
|
137 |
-
shape_data['text'] = shape.text.strip()
|
138 |
-
if hasattr(shape, 'text_frame'):
|
139 |
-
for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs):
|
140 |
-
if paragraph.runs:
|
141 |
-
run = paragraph.runs[0] # Assuming properties are mostly consistent in the first run
|
142 |
-
if hasattr(run.font, 'size') and run.font.size is not None:
|
143 |
-
shape_data['font_size'] = run.font.size.pt
|
144 |
-
if hasattr(run.font, 'name'):
|
145 |
-
shape_data['font_name'] = run.font.name
|
146 |
-
if hasattr(run.font, 'bold'):
|
147 |
-
shape_data['bold'] = run.font.bold
|
148 |
-
if hasattr(run.font, 'italic'):
|
149 |
-
shape_data['italic'] = run.font.italic
|
150 |
-
if (hasattr(run.font, 'color') and
|
151 |
-
run.font.color is not None and
|
152 |
-
hasattr(run.font.color, 'rgb') and
|
153 |
-
run.font.color.rgb is not None):
|
154 |
-
shape_data['font_color'] = str(run.font.color.rgb)
|
155 |
-
|
156 |
-
if hasattr(paragraph, 'alignment') and paragraph.alignment is not None:
|
157 |
-
shape_data['alignment'] = str(paragraph.alignment).split('.')[-1]
|
158 |
-
if hasattr(paragraph, 'space_before'):
|
159 |
-
shape_data['space_before'] = paragraph.space_before.pt if paragraph.space_before else None
|
160 |
-
if hasattr(paragraph, 'space_after'):
|
161 |
-
shape_data['space_after'] = paragraph.space_after.pt if paragraph.space_after else None
|
162 |
-
|
163 |
-
if hasattr(paragraph, 'line_spacing') and paragraph.line_spacing:
|
164 |
-
line_spacing = paragraph.line_spacing
|
165 |
-
|
166 |
-
# Nếu line_spacing là một số lớn (ví dụ: 84.99 pt), có thể là EXACTLY
|
167 |
-
if isinstance(line_spacing, Pt) or line_spacing > 10:
|
168 |
-
line_spacing_rule = "EXACTLY"
|
169 |
-
elif isinstance(line_spacing, float):
|
170 |
-
line_spacing_rule = "MULTIPLE"
|
171 |
-
else:
|
172 |
-
line_spacing_rule = "UNKNOWN"
|
173 |
-
|
174 |
-
shape_data['line_spacing_info'] = {
|
175 |
-
'rule': line_spacing_rule,
|
176 |
-
'value': line_spacing if isinstance(line_spacing, float) else None
|
177 |
-
}
|
178 |
-
|
179 |
-
return shape_data
|
180 |
-
|
181 |
-
def apply_shape_properties(shape, shape_data):
    """Apply saved geometry and text formatting to a PowerPoint shape.

    ``shape_data`` is the dict produced by the matching extractor. The
    extractor only writes formatting keys when the source run actually
    carried that property, so every optional lookup below uses ``.get`` —
    a bare ``shape_data['font_size']`` raised ``KeyError`` for sparse
    dicts and aborted all remaining formatting inside the broad except.

    Args:
        shape: python-pptx shape whose text frame will be rewritten.
        shape_data: dict of saved properties (geometry keys are required,
            formatting keys optional).
    """
    try:
        # Geometry keys are always present in shape_data.
        shape.width = shape_data['width']
        shape.height = shape_data['height']
        shape.left = shape_data['left']
        shape.top = shape_data['top']

        # Replace existing content with a single run we can format.
        shape.text = ""
        paragraph = shape.text_frame.paragraphs[0]
        run = paragraph.add_run()
        run.text = shape_data['text']

        # Font size was saved in points; shrink to 90% so the translated
        # text is more likely to fit the original text box.
        font_size = shape_data.get('font_size')
        if font_size:
            run.font.size = Pt(font_size * 0.9)

        run.font.name = shape_data.get('font_name') or "Arial"

        if shape_data.get('font_color'):
            run.font.color.rgb = RGBColor.from_string(shape_data['font_color'])
        if shape_data.get('bold') is not None:
            run.font.bold = shape_data['bold']
        if shape_data.get('italic') is not None:
            run.font.italic = shape_data['italic']
        if shape_data.get('alignment'):
            paragraph.alignment = get_alignment_value(shape_data['alignment'])

        # Line spacing: EXACTLY / AT_LEAST were saved as point values,
        # MULTIPLE as a bare float multiplier.
        line_spacing_info = shape_data.get('line_spacing_info', {})
        line_spacing_rule = line_spacing_info.get('rule')
        line_spacing_value = line_spacing_info.get('value')
        if line_spacing_rule and line_spacing_value is not None:
            if line_spacing_rule in ("EXACTLY", "AT_LEAST"):
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "MULTIPLE":
                paragraph.line_spacing = line_spacing_value
            else:
                print(f"⚠️ Unknown line spacing rule: {line_spacing_rule}")

        # Paragraph spacing was saved as a point float (via ``.pt``);
        # convert back through Pt() — assigning the raw float is not a
        # valid Length for python-pptx.
        if shape_data.get('space_before'):
            paragraph.space_before = Pt(shape_data['space_before'])
        if shape_data.get('space_after'):
            paragraph.space_after = Pt(shape_data['space_after'])

    except Exception as e:
        # Best-effort: a failure on one shape must not abort the deck.
        print(f"Error applying shape properties: {str(e)}")
|
231 |
-
|
232 |
-
|
233 |
-
def apply_table_properties(table, table_data):
    """Apply saved per-cell text and formatting to a PowerPoint table.

    ``table_data['cells']`` is a row-major list of per-cell dicts produced
    by the extractor. The live table may have more rows/columns than were
    saved; such cells are now skipped instead of raising an ``IndexError``
    that was caught and logged once per extra cell.

    Args:
        table: python-pptx ``Table`` object to update in place.
        table_data: dict with a ``'cells'`` grid of saved cell properties.
    """
    saved_rows = table_data.get('cells', [])
    for row_idx, row in enumerate(table.rows):
        for col_idx, cell in enumerate(row.cells):
            # Skip live cells that have no saved counterpart.
            if row_idx >= len(saved_rows) or col_idx >= len(saved_rows[row_idx]):
                continue
            cell_data = saved_rows[row_idx][col_idx]
            try:
                # Cell margins; default 0 when not recorded.
                # NOTE(review): values round-trip through JSON as plain
                # ints (EMU) — confirm python-pptx accepts them here.
                cell.margin_left = cell_data.get('margin_left', 0)
                cell.margin_right = cell_data.get('margin_right', 0)
                cell.margin_top = cell_data.get('margin_top', 0)
                cell.margin_bottom = cell_data.get('margin_bottom', 0)

                # Vertical anchor, resolved via helper (avoids eval).
                if 'vertical_anchor' in cell_data:
                    cell.vertical_anchor = get_vertical_anchor(cell_data['vertical_anchor'])

                # Clear old content and write the translated text as a
                # single run we can format.
                cell.text = ""
                paragraph = cell.text_frame.paragraphs[0]
                run = paragraph.add_run()
                run.text = cell_data.get('text', "")

                # Font size: keep 90% of the original so text fits.
                if 'font_size' in cell_data:
                    run.font.size = Pt(cell_data['font_size'] * 0.9)

                # Font family.
                run.font.name = cell_data.get('font_name', 'Arial')

                # Font color.
                if 'font_color' in cell_data:
                    run.font.color.rgb = RGBColor.from_string(cell_data['font_color'])

                # Bold & italic.
                run.font.bold = cell_data.get('bold', False)
                run.font.italic = cell_data.get('italic', False)

                # Text alignment.
                if 'alignment' in cell_data:
                    paragraph.alignment = get_alignment_value(cell_data['alignment'])

            except Exception as e:
                print(f"Lỗi khi thiết lập thuộc tính ô [{row_idx}, {col_idx}]: {str(e)}")
|
278 |
-
|
279 |
-
|
280 |
-
def get_file_from_mongodb(db_name, collection_name, file_id):
    """Fetch a file from MongoDB GridFS and return its GridOut handle.

    Callers read content via ``.read()`` and may use ``.filename``, and
    GridOut streams its chunks lazily — which is why the client is NOT
    closed here (unlike :func:`save_file_to_mongodb`).

    Args:
        db_name: database name.
        collection_name: GridFS bucket/collection name.
        file_id: GridFS ``_id`` of the stored file.

    Returns:
        gridfs.GridOut: lazy, file-like handle to the stored bytes.
    """
    import os
    # SECURITY: credentials were previously hard-coded here. Prefer the
    # MONGODB_URI environment variable; fall back to the legacy URI so
    # existing deployments keep working. Rotate the exposed password.
    uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
    )
    client = MongoClient(uri)
    db = client[db_name]
    fs = GridFS(db, collection_name)
    return fs.get(file_id)
|
288 |
-
|
289 |
-
|
290 |
-
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
    """Store *file_data* in MongoDB GridFS under *file_name*.

    Args:
        db_name: database name.
        collection_name: GridFS bucket/collection name.
        file_name: filename to record on the stored file.
        file_data: bytes or file-like object to store.

    Returns:
        The GridFS ``_id`` of the newly stored file.
    """
    import os
    # SECURITY: credentials were previously hard-coded here. Prefer the
    # MONGODB_URI environment variable; fall back to the legacy URI so
    # existing deployments keep working. Rotate the exposed password.
    uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
    )
    client = MongoClient(uri)
    try:
        db = client[db_name]
        fs = GridFS(db, collection_name)
        return fs.put(file_data, filename=file_name)
    finally:
        # Close even when fs.put raises — the original leaked the client
        # on failure.
        client.close()
|
298 |
-
|
299 |
-
def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
    """Build a translated PowerPoint from MongoDB assets and store the result.

    Loads the original .pptx and the translated XML from GridFS, copies each
    translated element's saved properties back onto the matching shape (by
    1-based slide number and shape index), then saves the rebuilt deck into
    *output_collection*.

    Args:
        db_name: MongoDB database name.
        original_ppt_id: GridFS id of the source .pptx in "root_file".
        translated_xml_id: GridFS id of the translated XML in "final_xml".
        output_collection: GridFS bucket to store the result in.

    Returns:
        The GridFS id of the stored translated deck, or None on failure.
    """
    import os
    try:
        # Fetch both inputs from GridFS.
        original_ppt = get_file_from_mongodb(db_name, "root_file", original_ppt_id)
        translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)

        # Load the source deck and the translated XML tree.
        prs = Presentation(BytesIO(original_ppt.read()))
        tree = ET.parse(BytesIO(translated_xml.read()))
        root = tree.getroot()

        # Apply translations slide by slide, shape by shape.
        for slide_number, slide in enumerate(prs.slides, 1):
            xml_slide = root.find(f".//slide[@number='{slide_number}']")
            if xml_slide is None:
                continue
            for shape_index, shape in enumerate(slide.shapes):
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    apply_group_properties_recursive(shape, shape_index, xml_slide)
                elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = xml_slide.find(f".//table_element[@shape_index='{shape_index}']")
                    if table_element is not None:
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                table_data = json.loads(props_element.text)
                                apply_table_properties(shape.table, table_data)
                            except Exception as e:
                                print(f"Error applying table properties: {str(e)}")
                elif hasattr(shape, "text"):
                    text_element = xml_slide.find(f".//text_element[@shape_index='{shape_index}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(shape, shape_data)
                            except Exception as e:
                                print(f"Error applying shape properties: {str(e)}")

        # Serialize the rebuilt deck into memory.
        output_io = BytesIO()
        prs.save(output_io)
        output_io.seek(0)  # rewind so GridFS reads from the start

        # BUG FIX: the old code used filename.replace(".xml", ".pptx"),
        # a no-op on a ".pptx" source name; build "<stem>_translated<ext>"
        # as the original comment intended.
        stem, ext = os.path.splitext(original_ppt.filename)
        translated_filename = f"{stem}_translated{ext or '.pptx'}"

        file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
        print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")

        return file_id
    except Exception as e:
        print(f"Error creating translated PowerPoint: {str(e)}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
powerpoint/pptx_processor.py
DELETED
@@ -1,50 +0,0 @@
|
|
1 |
-
# ppt_processor.py
|
2 |
-
from pathlib import Path
|
3 |
-
from xml_handling import ppt_to_xml, translate_xml_file
|
4 |
-
from pptx_object import create_translated_ppt
|
5 |
-
import os
|
6 |
-
|
7 |
-
def process_ppt_file(ppt_path: "Path | str", source_lang: str, target_lang: str):
    """Process a single PPT/PPTX file end to end: extract XML, translate it,
    and rebuild a translated deck next to the original file.

    Accepts either a ``Path`` or a shell-style quoted/escaped path string.

    Args:
        ppt_path: path to the .ppt/.pptx file (str or Path).
        source_lang: language code of the source text.
        target_lang: language code to translate into.
    """
    # BUG FIX: the original called str methods (.strip/.replace) directly
    # on a Path-annotated argument, which raises AttributeError for a real
    # Path object. Normalize to str first, then un-escape shell quoting.
    raw = str(ppt_path).strip("'\"")
    raw = raw.replace("\\ ", " ").replace("\\'", "'")
    ppt_path = Path(os.path.expanduser(raw)).resolve()
    # TODO: switch to a DB link on the server (per original note).
    try:
        if not ppt_path.is_file():
            print(f"Error: '{ppt_path}' is not a valid file.")
            return
        if ppt_path.suffix.lower() not in ['.ppt', '.pptx']:
            print(f"Error: '{ppt_path}' is not a PowerPoint file.")
            return

        base_dir = ppt_path.parent

        # Step 1: extract the original structure to XML.
        print(f"Generating original XML for {ppt_path.name}...")
        original_xml = ppt_to_xml(str(ppt_path))
        if original_xml:
            original_output_path = base_dir / f"{ppt_path.stem}_original.xml"
            with open(original_output_path, 'w', encoding='utf-8') as f:
                f.write(original_xml)
            print(f"Original XML saved: {original_output_path}")

            # Save original XML to MongoDB
            # save_xml_to_mongodb(original_xml, ppt_path.stem + "_original.xml")

        # Step 2: translate the XML.
        print(f"Generating translated XML (from {source_lang} to {target_lang}) for {ppt_path.name}...")
        translated_output_path = base_dir / f"{ppt_path.stem}_translated.xml"
        original_xml_path = base_dir / f"{ppt_path.stem}_original.xml"
        translate_xml_file(str(original_xml_path), str(translated_output_path), source_lang, target_lang)

        # Step 3: rebuild the deck with the translated text.
        print(f"Creating translated PPT for {ppt_path.name}...")
        output_filename = f"{ppt_path.stem}_translated{ppt_path.suffix}"
        output_ppt_path = base_dir / output_filename
        create_translated_ppt(str(ppt_path), str(translated_output_path), str(output_ppt_path))

    except Exception as e:
        print(f"Error in process_ppt_file for {ppt_path}: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|