mintlee committed on
Commit
4d84219
·
1 Parent(s): 73196e5

update xlsx

Browse files
excel/__pycache__/xlsx.cpython-310.pyc ADDED
Binary file (11.3 kB). View file
 
excel/xlsx.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import copy
4
+ import time
5
+ import xml.etree.ElementTree as ET
6
+ from typing import List, Dict, Any, Optional, Tuple
7
+ from utils.utils import translate_text, unzip_office_file, preprocess_text, postprocess_text, translate_single_text
8
+ from pymongo import MongoClient
9
+ import gridfs
10
+ from io import BytesIO
11
+ import shutil
12
+ import io
13
+
14
+ NS_MAIN = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
15
+
16
# --- Namespace registration (important when writing the file back) ---
def register_namespaces(xml_file):
    """Read the namespace declarations from *xml_file* and register them
    globally with ElementTree so that serialization preserves the original
    prefixes instead of generating ns0/ns1 aliases.

    Args:
        xml_file: Path to an XML file to scan for ``xmlns`` declarations.
    """
    namespaces = dict(
        node for _, node in ET.iterparse(xml_file, events=['start-ns'])
    )
    for ns, uri in namespaces.items():
        try:
            ET.register_namespace(ns, uri)
        except ValueError:
            # BUGFIX: register_namespace raises ValueError for reserved
            # prefixes (those starting with "xml"); skip instead of aborting.
            pass
    # Also register the spreadsheet "main" namespace if the file did not declare it.
    if 'main' not in namespaces and '' not in namespaces:  # check the empty prefix too
        ET.register_namespace('', NS_MAIN['main'])  # register as the default namespace
    elif 'main' not in namespaces:
        ET.register_namespace('main', NS_MAIN['main'])  # register under the 'main' prefix
29
+
30
+
31
def extract_text_from_sheet(unzipped_folder_path: str) -> Optional[Tuple[List[Dict[str, Any]], Dict[str, Any]]]:
    """
    Extract translatable text from an unzipped XLSX package, keeping the
    formatting (<rPr>) of the first run when a shared string is Rich Text.

    Returns a tuple of:
      - modifiable_nodes: list of dicts, each holding the original text, a
        live reference to the XML element to rewrite, the saved first-run
        format (or None), and the relative source file it came from.
      - global_data: parsed trees and paths needed later by
        apply_and_save_changes() to write the modified XML back.
    """
    modifiable_nodes = []
    shared_strings_path = os.path.join(unzipped_folder_path, "xl", "sharedStrings.xml")
    worksheets_folder = os.path.join(unzipped_folder_path, "xl", "worksheets")
    shared_tree = None
    sheet_trees = {}

    # --- Process sharedStrings.xml ---
    if os.path.exists(shared_strings_path):
        try:
            register_namespaces(shared_strings_path)
            shared_tree = ET.parse(shared_strings_path)
            root_shared = shared_tree.getroot()

            for si_element in root_shared.findall('main:si', NS_MAIN):
                text_parts = []
                t_elements = si_element.findall('.//main:t', NS_MAIN)  # find all descendant <t>

                # Find the first run (<r>) and its properties (<rPr>)
                first_r = si_element.find('./main:r', NS_MAIN)  # first direct <r> child
                first_rpr_clone = None  # holds a copy of the first <rPr>
                is_rich_text = first_r is not None

                if is_rich_text:
                    # Look for <rPr> inside the first <r>
                    first_rpr = first_r.find('./main:rPr', NS_MAIN)
                    if first_rpr is not None:
                        # Deep-copy so we don't disturb the original tree and can reuse it later
                        first_rpr_clone = copy.deepcopy(first_rpr)

                # Concatenate the full text of the shared string
                for t_node in t_elements:
                    if t_node.text:
                        text_parts.append(t_node.text)
                full_text = "".join(text_parts)

                if not full_text: continue  # skip entries with no text

                if is_rich_text:
                    modifiable_nodes.append({
                        'type': 'shared_rich',
                        'original_text': full_text,
                        'element': si_element,  # reference to <si>
                        'first_format': first_rpr_clone,  # first <rPr> format (or None)
                        'source_file': os.path.join("xl", "sharedStrings.xml"),
                        'sheet_name': None
                    })
                elif t_elements:  # not rich text: use the plain <t> tag
                    first_t = si_element.find('./main:t', NS_MAIN)
                    if first_t is not None:
                        modifiable_nodes.append({
                            'type': 'shared_simple',
                            'original_text': full_text,
                            'element': first_t,  # reference to <t>
                            'first_format': None,  # no special formatting
                            'source_file': os.path.join("xl", "sharedStrings.xml"),
                            'sheet_name': None
                        })

        except Exception as e:
            print(f"Lỗi xử lý sharedStrings: {e}")
            import traceback
            traceback.print_exc()

    # --- Process the sheetX.xml files (inline strings - no complex formatting) ---
    if os.path.isdir(worksheets_folder):
        for sheet_filename in sorted(os.listdir(worksheets_folder)):
            if sheet_filename.lower().endswith(".xml"):
                # ... (read and parse the sheet tree as before) ...
                sheet_file_path = os.path.join(worksheets_folder, sheet_filename)
                try:
                    register_namespaces(sheet_file_path)
                    sheet_tree = ET.parse(sheet_file_path)
                    sheet_trees[sheet_filename] = sheet_tree
                    root_sheet = sheet_tree.getroot()
                    for cell in root_sheet.findall('.//main:c[@t="inlineStr"]', NS_MAIN):
                        t_element = cell.find('.//main:is/main:t', NS_MAIN)
                        if t_element is not None and t_element.text is not None:
                            modifiable_nodes.append({
                                'type': 'inline',
                                'original_text': t_element.text,
                                'element': t_element,  # reference to <t>
                                'first_format': None,  # inline strings carry no <rPr> formatting
                                'source_file': os.path.join("xl", "worksheets", sheet_filename),
                                'sheet_name': sheet_filename
                            })
                except Exception as e:
                    print(f"Lỗi xử lý sheet {sheet_filename}: {e}")
                    import traceback
                    traceback.print_exc()

    else:
        print(f"Lỗi: Không tìm thấy thư mục worksheets: {worksheets_folder}")


    global_data = {"shared_tree": shared_tree, "sheet_trees": sheet_trees, "shared_strings_path": shared_strings_path, "worksheets_folder": worksheets_folder}
    return modifiable_nodes, global_data
131
+
132
def apply_and_save_changes(modified_nodes_data: List[Dict[str, Any]], global_data: Dict[str, Any]) -> bool:
    """
    Write translated text back into the XML trees and save the changed files.

    For Rich Text shared strings the children of <si> are rebuilt as a single
    run carrying the saved first-run format; simple/inline strings only have
    their <t> text replaced. Returns True when all touched files saved OK.
    """
    if not global_data: print("Lỗi: Thiếu global_data."); return False

    updated_files = set()
    # Registering the reserved 'xml' prefix raises ValueError on some Python
    # versions; ignore it so xml:space attributes still serialize correctly.
    try: ET.register_namespace('xml', "http://www.w3.org/XML/1998/namespace")
    except ValueError: pass

    for node_info in modified_nodes_data:
        if 'modified_text' in node_info and node_info['element'] is not None:
            element = node_info['element']
            modified_text = node_info['modified_text']
            original_text = node_info.get('original_text', '')
            node_type = node_info.get('type', '')
            first_format = node_info.get('first_format')  # saved <rPr> clone (or None)

            if original_text != modified_text:
                # --- Rich Text: rebuild the <si><r>[<rPr>]<t></r></si> structure ---
                if node_type == 'shared_rich':
                    si_element = element
                    # Remove the old children (all original runs)
                    for child in list(si_element):
                        si_element.remove(child)

                    # Create a new run <r>
                    new_r = ET.Element(f"{{{NS_MAIN['main']}}}r")

                    # If a first-run format (<rPr>) was saved, attach it to the new <r>
                    if first_format is not None:
                        new_r.append(first_format)  # append the saved <rPr> clone

                    # Create the new text element <t>
                    new_t = ET.Element(f"{{{NS_MAIN['main']}}}t")
                    new_t.text = modified_text
                    # xml:space="preserve" keeps leading/trailing whitespace in Excel
                    xml_space_attr = '{http://www.w3.org/XML/1998/namespace}space'
                    new_t.set(xml_space_attr, 'preserve')

                    # Attach <t> to <r>
                    new_r.append(new_t)
                    # Attach <r> to <si>
                    si_element.append(new_r)

                    updated_files.add(node_info['source_file'])
                    # print(f"Applied first format to Rich Text in {node_info['source_file']}")

                # --- Simple/Inline text: just update the <t> element ---
                elif node_type in ['shared_simple', 'inline']:
                    t_element = element
                    t_element.text = modified_text
                    xml_space_attr = '{http://www.w3.org/XML/1998/namespace}space'
                    if xml_space_attr not in t_element.attrib or t_element.attrib[xml_space_attr] != 'preserve':
                        t_element.set(xml_space_attr, 'preserve')
                    updated_files.add(node_info['source_file'])
                    # print(f"Updated Simple/Inline Text in {node_info['source_file']}")
                else:
                    print(f"Cảnh báo: Loại node không xác định '{node_type}'")

    # --- Save the modified XML files (unchanged logic) ---
    success = True
    # ... (file-saving code as before) ...
    shared_tree = global_data.get("shared_tree"); shared_strings_path = global_data.get("shared_strings_path")
    sheet_trees = global_data.get("sheet_trees", {}); worksheets_folder = global_data.get("worksheets_folder")

    shared_strings_relative_path = os.path.join("xl", "sharedStrings.xml")
    if shared_tree and shared_strings_path and shared_strings_relative_path in updated_files:
        try:
            # print(f"Saving modified file: {shared_strings_path}")
            shared_tree.write(shared_strings_path, encoding='utf-8', xml_declaration=True)
        except Exception as e: print(f"Lỗi lưu {shared_strings_path}: {e}"); success = False

    if worksheets_folder and os.path.exists(worksheets_folder):
        for sheet_filename, sheet_tree in sheet_trees.items():
            sheet_relative_path = os.path.join("xl", "worksheets", sheet_filename)
            if sheet_relative_path in updated_files:
                sheet_file_path = os.path.join(worksheets_folder, sheet_filename)
                try:
                    # print(f"Saving modified file: {sheet_file_path}")
                    sheet_tree.write(sheet_file_path, encoding='utf-8', xml_declaration=True)
                except Exception as e: print(f"Lỗi lưu {sheet_file_path}: {e}"); success = False

    if success and updated_files: print(f"Đã lưu thành công {len(updated_files)} file XML đã sửa đổi (đã giữ lại định dạng đầu tiên cho Rich Text).")
    elif not updated_files: print("Không có file XML nào cần cập nhật.") ; return True
    return success
217
+
218
def zip_folder_to_excel_file(folder_path, file_name):
    """
    Zip an unpacked OOXML folder back into an .xlsx (entirely in memory) and
    store the result in MongoDB GridFS.

    Args:
        folder_path: Root directory of the unzipped workbook package.
        file_name: Filename to record for the stored GridFS file.

    Returns:
        The GridFS file id on success, or None on failure.
    """
    client = None
    try:
        # Build the .xlsx archive in RAM
        xlsx_buffer = io.BytesIO()
        with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Store entries relative to the folder root so the archive
                    # layout matches the OOXML package structure.
                    archive_path = os.path.relpath(file_path, folder_path)
                    zipf.write(file_path, archive_path)

        xlsx_buffer.seek(0)

        # NOTE(security): credentials are hard-coded in the URI — move them
        # to configuration/environment variables.
        client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
        db = client['excel']
        fs = gridfs.GridFS(db, collection='final_file')

        file_id = fs.put(xlsx_buffer.read(), filename=file_name)
        print(f"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}")
        return file_id

    except Exception as e:
        print(f"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}")
        return None
    finally:
        # BUGFIX: the MongoClient was previously leaked on every call.
        if client is not None:
            client.close()
242
+
243
+
244
def get_text_list_from_nodes(modifiable_nodes: Optional[List[Dict[str, Any]]]) -> List[str]:
    """Collect the 'original_text' value of every node dict.

    Entries whose 'original_text' key is missing or None are skipped.
    Returns an empty list when *modifiable_nodes* is None.
    """
    if modifiable_nodes is None:
        return []

    texts: List[str] = []
    for node in modifiable_nodes:
        value = node.get('original_text')
        if value is not None:
            texts.append(value)
    return texts
257
+
258
+
259
def count_words(text: str) -> int:
    """Return the number of whitespace-delimited tokens in *text*.

    Empty or all-whitespace input counts as zero words.
    """
    return len(text.split()) if text and not text.isspace() else 0
264
+
265
# Helper function to process a batch of valid segments (unchanged behavior)
def _translate_batch_helper(segments_to_translate, original_indices_1based, source_lang, target_lang):
    """Preprocess, translate and postprocess one batch of segments.

    Any failure (API error or a result-count mismatch) is mapped to one error
    marker per segment so the caller's index alignment is preserved.
    """
    if not segments_to_translate:
        return []

    expected = len(segments_to_translate)
    try:
        cleaned = preprocess_text(segments_to_translate)
        raw_translation = translate_text(cleaned, source_lang, target_lang)
        finished = postprocess_text(raw_translation)
    except Exception as e:
        print(f" *** ERROR during batch translation: {e}. Marking batch as failed.")
        # traceback.print_exc() # Uncomment for detailed debug
        return ["<translation_api_error>"] * expected

    if len(finished) == expected:
        return finished

    print(f" *** CRITICAL ERROR: Batch translation result count mismatch! Expected {expected}, got {len(finished)}. Marking batch as failed.")
    return ["<translation_length_mismatch_error>"] * expected
292
+
293
+
294
def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):
    """
    Translate an XLSX workbook stored in MongoDB GridFS, batching short text
    segments and translating over-long segments individually.

    Args:
        file_id: GridFS id of the source workbook in the 'root_file' collection.
        file_name (str): Name under which the translated workbook is stored.
        source_lang (str): Source language code.
        target_lang (str): Target language code.
        batch_size_segments (int): Desired maximum number of segments per API call.
        max_words_per_segment (int): Word limit for a segment to be batch-translated;
            longer segments are translated one by one.
        delay_between_requests (int): Seconds to wait between translation API calls.

    Returns:
        The GridFS id of the translated workbook, or None if any step failed.
    """
    # NOTE(security): credentials are hard-coded in the URI — move them to
    # configuration/environment variables.
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    try:
        db = client['excel']
        fs = gridfs.GridFS(db, collection='root_file')
        excel_file = BytesIO(fs.get(file_id).read())
    finally:
        # BUGFIX: the client was previously never closed.
        client.close()

    xml_folder = unzip_office_file(excel_file)

    modifiable_nodes, global_data = extract_text_from_sheet(xml_folder)

    original_texts = get_text_list_from_nodes(modifiable_nodes)

    all_results = [None] * len(original_texts)
    current_index = 0
    processed_count = 0
    api_call_counter = 0  # Track API calls for delay logic

    while current_index < len(original_texts):
        batch_texts_to_translate = []
        batch_original_indices = []  # 0-based indices for assignment
        batch_end_index = min(current_index + batch_size_segments, len(original_texts))
        found_long_segment_at = -1  # 0-based index in original_texts

        # 1. Build the next potential batch, stopping if a long segment is found
        for i in range(current_index, batch_end_index):
            segment = original_texts[i]
            word_count = count_words(segment)

            if word_count <= max_words_per_segment:
                batch_texts_to_translate.append(segment)
                batch_original_indices.append(i)
            else:
                found_long_segment_at = i
                break  # Stop building this batch

        # 2. Translate the VALID batch collected *before* the long segment (if any)
        if batch_texts_to_translate:
            # Add delay BEFORE the API call if it's not the very first call
            if api_call_counter > 0 and delay_between_requests > 0:
                time.sleep(delay_between_requests)

            translated_batch = _translate_batch_helper(
                batch_texts_to_translate,
                [idx + 1 for idx in batch_original_indices],  # 1-based for logging
                source_lang,
                target_lang
            )
            api_call_counter += 1
            # Assign results back
            for batch_idx, original_idx in enumerate(batch_original_indices):
                all_results[original_idx] = translated_batch[batch_idx]
            processed_count += len(batch_texts_to_translate)

        # 3. Handle the long segment INDIVIDUALLY (if one was found)
        if found_long_segment_at != -1:
            long_segment_index = found_long_segment_at
            long_segment_text = str(original_texts[long_segment_index])

            try:
                # BUGFIX: removed the dead `if len([translated]) == 1` check —
                # a single-element list always has length 1.
                translated = translate_single_text(long_segment_text, source_lang, target_lang)
                api_call_counter += 1
                all_results[long_segment_index] = translated

            except Exception as e:
                print(f" *** ERROR during translation of long segment {long_segment_index + 1}: {e}. Marking as failed.")
                # traceback.print_exc() # Uncomment for detailed debug
                all_results[long_segment_index] = "<translation_api_error>"
                # Do not increment api_call_counter if the API call itself failed before returning

            processed_count += 1
            # Update current_index to start AFTER this long segment
            current_index = long_segment_index + 1

        else:
            # No long segment was found in the range checked.
            # Move current_index to the end of the range examined.
            current_index = batch_end_index

    # Any segment left unprocessed keeps its original text.
    missing_count = 0
    final_texts_for_nodes = []
    for i, res in enumerate(all_results):
        if res is None:
            print(f"LỖI LOGIC: Segment {i+1} không được xử lý! Giữ lại text gốc: '{original_texts[i]}'")
            final_texts_for_nodes.append(original_texts[i])
            missing_count += 1
        else:
            final_texts_for_nodes.append(res)

    if missing_count > 0:
        print(f"CẢNH BÁO NGHIÊM TRỌNG: {missing_count} segments bị bỏ lỡ trong quá trình xử lý.")

    # BUGFIX: final_id was previously unbound when the length check or the XML
    # save failed, raising NameError at `return final_id`.
    final_id = None
    if len(final_texts_for_nodes) != len(original_texts):
        print(f"LỖI NGHIÊM TRỌNG: Số lượng text cuối cùng ({len(final_texts_for_nodes)}) không khớp với gốc ({len(original_texts)}). Hủy bỏ cập nhật.")
    else:
        # Attach the translations to the nodes
        for i, node_info in enumerate(modifiable_nodes):
            node_info['modified_text'] = final_texts_for_nodes[i]

        save_success = apply_and_save_changes(modifiable_nodes, global_data)
        if not save_success:
            print("LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.")
        else:
            # Only zip if saving XML was successful
            final_id = zip_folder_to_excel_file(xml_folder, file_name)
            if final_id:
                shutil.rmtree(xml_folder)  # Mark folder as 'handled' by zipping
            else:
                print("LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.")
    return final_id
428
+
429
+
430
+
pages/upload.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import google.generativeai as genai
3
  from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
4
  from powerpoint.pptx import translate_pptx
5
- from excel.excel_translate import translate_xlsx, translate_csv
6
  from word.word_translate import translate_docx_from_mongodb
7
  import dotenv
8
  import os
@@ -26,16 +26,11 @@ def process_file(file, file_type):
26
 
27
  if file_type == "PPTX":
28
  final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
29
- # progress_bar.progress(40)
30
- # text_dict = extract_text_from_xml(file_id=xml_file_id)
31
- # translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
32
- # progress_bar.progress(60)
33
- # final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
34
- # final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
35
  elif file_type == "Excel":
36
- final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
37
- elif file_type == "CSV":
38
- final_id = translate_csv(file_id = file_id, target_lang = target_lang)
39
  elif file_type == "Word":
40
  final_id = translate_docx_from_mongodb(file_id, target_lang)
41
  else:
 
2
  import google.generativeai as genai
3
  from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
4
  from powerpoint.pptx import translate_pptx
5
+ from excel.xlsx import translate_xlsx
6
  from word.word_translate import translate_docx_from_mongodb
7
  import dotenv
8
  import os
 
26
 
27
  if file_type == "PPTX":
28
  final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
29
+ progress_bar.progress(60)
 
 
 
 
 
30
  elif file_type == "Excel":
31
+ final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
32
+ # elif file_type == "CSV":
33
+ # final_id = translate_csv(file_id = file_id, target_lang = target_lang)
34
  elif file_type == "Word":
35
  final_id = translate_docx_from_mongodb(file_id, target_lang)
36
  else:
powerpoint/__pycache__/pptx.cpython-310.pyc CHANGED
Binary files a/powerpoint/__pycache__/pptx.cpython-310.pyc and b/powerpoint/__pycache__/pptx.cpython-310.pyc differ
 
powerpoint/pptx.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
  import zipfile
3
  import shutil
4
- from pptx import Presentation
5
  from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
6
  from powerpoint.xml_handling import *
7
  from pymongo import MongoClient
8
  import gridfs
9
- from bson import ObjectId
10
  from io import BytesIO
11
 
12
  def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
 
1
  import os
2
  import zipfile
3
  import shutil
 
4
  from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
5
  from powerpoint.xml_handling import *
6
  from pymongo import MongoClient
7
  import gridfs
 
8
  from io import BytesIO
9
 
10
  def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
powerpoint/pptx_object.py DELETED
@@ -1,354 +0,0 @@
1
- # ppt_objects.py
2
- from pptx import Presentation
3
- from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
4
- from pptx.enum.shapes import MSO_SHAPE_TYPE
5
- import xml.etree.ElementTree as ET
6
- from pptx.util import Pt
7
- from pptx.dml.color import RGBColor
8
- import re
9
- import json
10
-
11
- from pymongo import MongoClient
12
- from gridfs import GridFS
13
- import json
14
- import xml.etree.ElementTree as ET
15
- from io import BytesIO
16
-
17
-
18
- def apply_group_properties_recursive(shape, shape_index, parent_element):
19
- """Recursively applies properties to shapes within groups."""
20
- if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
21
- group_element = parent_element.find(f".//group_element[@shape_index='{shape_index}']")
22
- if group_element is not None:
23
- for i, sub_shape in enumerate(shape.shapes):
24
- apply_group_properties_recursive(sub_shape, i, group_element)
25
-
26
- # Apply properties for sub-shapes WITHIN the group, based on their type.
27
- if sub_shape.shape_type == MSO_SHAPE_TYPE.TABLE:
28
- table_element = group_element.find(f".//table_element[@shape_index='{i}']")
29
- if table_element: # Use a shorter name for clarity
30
- props_element = table_element.find("properties")
31
- if props_element is not None and props_element.text:
32
- try:
33
- table_data = json.loads(props_element.text)
34
- apply_table_properties(sub_shape.table, table_data)
35
- except (json.JSONDecodeError, KeyError) as e:
36
- print(f"Error applying table properties (in group): {str(e)}")
37
-
38
- elif hasattr(sub_shape, "text_frame") and sub_shape.text_frame:
39
- text_element = group_element.find(f".//text_element[@shape_index='{i}']")
40
- if text_element: # Shorter name
41
- props_element = text_element.find("properties")
42
- if props_element is not None and props_element.text:
43
- try:
44
- shape_data = json.loads(props_element.text)
45
- apply_shape_properties(sub_shape, shape_data)
46
- except (json.JSONDecodeError, KeyError) as e:
47
- print(f"Error applying shape properties (in group): {str(e)}")
48
-
49
- def get_alignment_value(alignment_str):
50
- """Convert alignment string (with extra characters) to PP_ALIGN enum value."""
51
- alignment_map = {
52
- 'center': PP_ALIGN.CENTER,
53
- 'left': PP_ALIGN.LEFT,
54
- 'right': PP_ALIGN.RIGHT,
55
- 'justify': PP_ALIGN.JUSTIFY
56
- }
57
- match = re.match(r"([A-Za-z]+)", alignment_str)
58
- return alignment_map.get(match.group(1).lower()) if match else None
59
-
60
- def get_vertical_anchor(value):
61
- """Converts vertical_anchor string to MSO_ANCHOR enum."""
62
- mapping = {
63
- "TOP": MSO_ANCHOR.TOP,
64
- "MIDDLE": MSO_ANCHOR.MIDDLE,
65
- "BOTTOM": MSO_ANCHOR.BOTTOM
66
- }
67
- return mapping.get(value.upper().split()[0], MSO_ANCHOR.TOP)
68
-
69
- def get_table_properties(table):
70
- """Extract complete table properties."""
71
- table_data = {
72
- 'rows': len(table.rows),
73
- 'cols': len(table.columns),
74
- 'cells': []
75
- }
76
- for row in table.rows:
77
- row_data = []
78
- for cell in row.cells:
79
- cell_data = {
80
- 'text': cell.text.strip(),
81
- 'font_size': None,
82
- 'font_name': None,
83
- 'alignment': None,
84
- 'margin_left': cell.margin_left,
85
- 'margin_right': cell.margin_right,
86
- 'margin_top': cell.margin_top,
87
- 'margin_bottom': cell.margin_bottom,
88
- 'vertical_anchor': str(cell.vertical_anchor) if cell.vertical_anchor else None,
89
- 'font_color': None
90
- }
91
- if cell.text_frame.paragraphs:
92
- paragraph = cell.text_frame.paragraphs[0]
93
- if paragraph.runs:
94
- run = paragraph.runs[0]
95
- if hasattr(run.font, 'size') and run.font.size is not None:
96
- cell_data['font_size'] = run.font.size.pt
97
- if hasattr(run.font, 'name'):
98
- cell_data['font_name'] = run.font.name
99
- if hasattr(run.font, 'bold'):
100
- cell_data['bold'] = run.font.bold
101
- if hasattr(run.font, 'italic'):
102
- cell_data['italic'] = run.font.italic
103
- if (hasattr(run.font, 'color') and
104
- run.font.color is not None and
105
- hasattr(run.font.color, 'rgb') and
106
- run.font.color.rgb is not None):
107
- cell_data['font_color'] = str(run.font.color.rgb)
108
- if hasattr(paragraph, 'alignment'):
109
- cell_data['alignment'] = f"{paragraph.alignment}" if paragraph.alignment else None
110
- row_data.append(cell_data)
111
- table_data['cells'].append(row_data)
112
- return table_data
113
-
114
- def get_shape_properties(shape):
115
- """Extract all properties from a shape, with detailed debug prints."""
116
- shape_data = {
117
- 'text': '',
118
- 'font_size': None,
119
- 'font_name': None,
120
- 'alignment': None,
121
- 'width': shape.width,
122
- 'height': shape.height,
123
- 'left': shape.left,
124
- 'top': shape.top,
125
- 'bold': None,
126
- 'italic': None,
127
- 'line_spacing_info': {
128
- 'rule': None,
129
- 'value': None
130
- },
131
- 'space_before': None,
132
- 'space_after': None,
133
- 'font_color': None
134
- }
135
-
136
- if hasattr(shape, "text"):
137
- shape_data['text'] = shape.text.strip()
138
- if hasattr(shape, 'text_frame'):
139
- for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs):
140
- if paragraph.runs:
141
- run = paragraph.runs[0] # Assuming properties are mostly consistent in the first run
142
- if hasattr(run.font, 'size') and run.font.size is not None:
143
- shape_data['font_size'] = run.font.size.pt
144
- if hasattr(run.font, 'name'):
145
- shape_data['font_name'] = run.font.name
146
- if hasattr(run.font, 'bold'):
147
- shape_data['bold'] = run.font.bold
148
- if hasattr(run.font, 'italic'):
149
- shape_data['italic'] = run.font.italic
150
- if (hasattr(run.font, 'color') and
151
- run.font.color is not None and
152
- hasattr(run.font.color, 'rgb') and
153
- run.font.color.rgb is not None):
154
- shape_data['font_color'] = str(run.font.color.rgb)
155
-
156
- if hasattr(paragraph, 'alignment') and paragraph.alignment is not None:
157
- shape_data['alignment'] = str(paragraph.alignment).split('.')[-1]
158
- if hasattr(paragraph, 'space_before'):
159
- shape_data['space_before'] = paragraph.space_before.pt if paragraph.space_before else None
160
- if hasattr(paragraph, 'space_after'):
161
- shape_data['space_after'] = paragraph.space_after.pt if paragraph.space_after else None
162
-
163
- if hasattr(paragraph, 'line_spacing') and paragraph.line_spacing:
164
- line_spacing = paragraph.line_spacing
165
-
166
- # Nếu line_spacing là một số lớn (ví dụ: 84.99 pt), có thể là EXACTLY
167
- if isinstance(line_spacing, Pt) or line_spacing > 10:
168
- line_spacing_rule = "EXACTLY"
169
- elif isinstance(line_spacing, float):
170
- line_spacing_rule = "MULTIPLE"
171
- else:
172
- line_spacing_rule = "UNKNOWN"
173
-
174
- shape_data['line_spacing_info'] = {
175
- 'rule': line_spacing_rule,
176
- 'value': line_spacing if isinstance(line_spacing, float) else None
177
- }
178
-
179
- return shape_data
180
-
181
- def apply_shape_properties(shape, shape_data):
182
- """Apply saved properties to a shape."""
183
- try:
184
- shape.width = shape_data['width']
185
- shape.height = shape_data['height']
186
- shape.left = shape_data['left']
187
- shape.top = shape_data['top']
188
- shape.text = ""
189
- paragraph = shape.text_frame.paragraphs[0]
190
- run = paragraph.add_run()
191
- run.text = shape_data['text']
192
- if shape_data['font_size']:
193
- adjusted_size = shape_data['font_size'] * 0.9
194
- run.font.size = Pt(adjusted_size)
195
-
196
- if shape_data.get('font_name'):
197
- run.font.name = shape_data['font_name']
198
- else:
199
- run.font.name = "Arial"
200
- if shape_data.get('font_color'):
201
- run.font.color.rgb = RGBColor.from_string(shape_data['font_color'])
202
- if shape_data['bold'] is not None:
203
- run.font.bold = shape_data['bold']
204
- if shape_data['italic'] is not None:
205
- run.font.italic = shape_data['italic']
206
- if shape_data['alignment']:
207
- paragraph.alignment = get_alignment_value(shape_data['alignment'])
208
-
209
- line_spacing_info = shape_data.get('line_spacing_info', {})
210
- line_spacing_rule = line_spacing_info.get('rule')
211
- line_spacing_value = line_spacing_info.get('value')
212
-
213
- if line_spacing_rule and line_spacing_value is not None:
214
- if line_spacing_rule == "EXACTLY":
215
- paragraph.line_spacing = Pt(line_spacing_value)
216
- elif line_spacing_rule == "AT_LEAST":
217
- paragraph.line_spacing = Pt(line_spacing_value)
218
- elif line_spacing_rule == "MULTIPLE":
219
- paragraph.line_spacing = line_spacing_value
220
- else:
221
- print(f"⚠️ Unknown line spacing rule: {line_spacing_rule}")
222
-
223
- if shape_data['space_before']:
224
- paragraph.space_before = shape_data['space_before']
225
- if shape_data['space_after']:
226
- paragraph.space_after = shape_data['space_after']
227
-
228
-
229
- except Exception as e:
230
- print(f"Error applying shape properties: {str(e)}")
231
-
232
-
233
- def apply_table_properties(table, table_data):
234
- """Áp dụng các thuộc tính đã lưu vào bảng PowerPoint."""
235
- for row_idx, row in enumerate(table.rows):
236
- for col_idx, cell in enumerate(row.cells):
237
- try:
238
- cell_data = table_data['cells'][row_idx][col_idx]
239
-
240
- # Áp dụng margin
241
- cell.margin_left = cell_data.get('margin_left', 0)
242
- cell.margin_right = cell_data.get('margin_right', 0)
243
- cell.margin_top = cell_data.get('margin_top', 0)
244
- cell.margin_bottom = cell_data.get('margin_bottom', 0)
245
-
246
- # Áp dụng vertical_anchor (tránh dùng eval)
247
- if 'vertical_anchor' in cell_data:
248
- cell.vertical_anchor = get_vertical_anchor(cell_data['vertical_anchor'])
249
-
250
- # Xóa nội dung cũ và thiết lập văn bản mới
251
- cell.text = ""
252
- paragraph = cell.text_frame.paragraphs[0]
253
- run = paragraph.add_run()
254
- run.text = cell_data.get('text', "")
255
-
256
- # Thiết lập kích thước font
257
- if 'font_size' in cell_data:
258
- adjusted_size = cell_data['font_size'] * 0.9 # Giữ tỉ lệ font
259
- run.font.size = Pt(adjusted_size)
260
-
261
- # Thiết lập font chữ
262
- run.font.name = cell_data.get('font_name', 'Arial')
263
-
264
- # Màu chữ
265
- if 'font_color' in cell_data:
266
- run.font.color.rgb = RGBColor.from_string(cell_data['font_color'])
267
-
268
- # In đậm & in nghiêng
269
- run.font.bold = cell_data.get('bold', False)
270
- run.font.italic = cell_data.get('italic', False)
271
-
272
- # Căn lề văn bản
273
- if 'alignment' in cell_data:
274
- paragraph.alignment = get_alignment_value(cell_data['alignment'])
275
-
276
- except Exception as e:
277
- print(f"Lỗi khi thiết lập thuộc tính ô [{row_idx}, {col_idx}]: {str(e)}")
278
-
279
-
280
- def get_file_from_mongodb(db_name, collection_name, file_id):
281
- """Tải tệp từ MongoDB GridFS"""
282
- client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
283
- db = client[db_name]
284
- fs = GridFS(db, collection_name)
285
- file_data = fs.get(file_id)
286
- return file_data
287
- # return BytesIO(file_data.read())
288
-
289
-
290
- def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
291
- """Lưu tệp vào MongoDB GridFS"""
292
- client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
293
- db = client[db_name]
294
- fs = GridFS(db, collection_name)
295
- file_id = fs.put(file_data, filename=file_name)
296
- client.close()
297
- return file_id
298
-
299
- def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
300
- """Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
301
- try:
302
- # Kết nối MongoDB và tải file
303
- original_ppt= get_file_from_mongodb(db_name, "root_file", original_ppt_id)
304
- translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
305
-
306
- # Load PowerPoint gốc và XML dịch
307
- prs = Presentation(BytesIO(original_ppt.read()))
308
- tree = ET.parse(BytesIO(translated_xml.read()))
309
- root = tree.getroot()
310
-
311
- # Áp dụng bản dịch
312
- for slide_number, slide in enumerate(prs.slides, 1):
313
- xml_slide = root.find(f".//slide[@number='{slide_number}']")
314
- if xml_slide is None:
315
- continue
316
- for shape_index, shape in enumerate(slide.shapes):
317
- if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
318
- apply_group_properties_recursive(shape, shape_index, xml_slide)
319
- elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
320
- table_element = xml_slide.find(f".//table_element[@shape_index='{shape_index}']")
321
- if table_element is not None:
322
- props_element = table_element.find("properties")
323
- if props_element is not None and props_element.text:
324
- try:
325
- table_data = json.loads(props_element.text)
326
- apply_table_properties(shape.table, table_data)
327
- except Exception as e:
328
- print(f"Error applying table properties: {str(e)}")
329
- elif hasattr(shape, "text"):
330
- text_element = xml_slide.find(f".//text_element[@shape_index='{shape_index}']")
331
- if text_element is not None:
332
- props_element = text_element.find("properties")
333
- if props_element is not None and props_element.text:
334
- try:
335
- shape_data = json.loads(props_element.text)
336
- apply_shape_properties(shape, shape_data)
337
- except Exception as e:
338
- print(f"Error applying shape properties: {str(e)}")
339
-
340
- # Lưu PowerPoint vào MongoDB với tên gốc
341
- output_io = BytesIO()
342
- prs.save(output_io)
343
- output_io.seek(0) # Reset vị trí đọc
344
-
345
- # Giữ nguyên tên file gốc, thêm hậu tố "_translated"
346
- translated_filename = original_ppt.filename.replace(".xml", ".pptx")
347
-
348
- file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
349
- print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
350
-
351
- return file_id
352
- except Exception as e:
353
- print(f"Error creating translated PowerPoint: {str(e)}")
354
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
powerpoint/pptx_processor.py DELETED
@@ -1,50 +0,0 @@
1
- # ppt_processor.py
2
- from pathlib import Path
3
- from xml_handling import ppt_to_xml, translate_xml_file
4
- from pptx_object import create_translated_ppt
5
- import os
6
-
7
- def process_ppt_file(ppt_path: Path, source_lang: str, target_lang: str):
8
- """Process a single PPT/PPTX file from XML extraction to final translation."""
9
- ppt_path = ppt_path.strip("'\"")
10
- ppt_path = ppt_path.replace("\\ ", " ")
11
- ppt_path = ppt_path.replace("\\'", "'")
12
- ppt_path = os.path.expanduser(ppt_path)
13
- ppt_path = Path(ppt_path).resolve()
14
- # chuyển thành link DB trên server
15
- try:
16
- if not ppt_path.is_file():
17
- print(f"Error: '{ppt_path}' is not a valid file.")
18
- return
19
- if ppt_path.suffix.lower() not in ['.ppt', '.pptx']:
20
- print(f"Error: '{ppt_path}' is not a PowerPoint file.")
21
- return
22
-
23
- base_dir = ppt_path.parent
24
-
25
- # Original XML
26
- print(f"Generating original XML for {ppt_path.name}...")
27
- original_xml = ppt_to_xml(str(ppt_path))
28
- if original_xml:
29
- original_output_path = base_dir / f"{ppt_path.stem}_original.xml"
30
- with open(original_output_path, 'w', encoding='utf-8') as f:
31
- f.write(original_xml)
32
- print(f"Original XML saved: {original_output_path}")
33
-
34
- # Save original XML to MongoDB
35
- # save_xml_to_mongodb(original_xml, ppt_path.stem + "_original.xml")
36
-
37
- # Translated XML
38
- print(f"Generating translated XML (from {source_lang} to {target_lang}) for {ppt_path.name}...")
39
- translated_output_path = base_dir / f"{ppt_path.stem}_translated.xml"
40
- original_xml_path = base_dir / f"{ppt_path.stem}_original.xml"
41
- translate_xml_file(str(original_xml_path), str(translated_output_path), source_lang, target_lang)
42
-
43
- # Create Translated PPT
44
- print(f"Creating translated PPT for {ppt_path.name}...")
45
- output_filename = f"{ppt_path.stem}_translated{ppt_path.suffix}"
46
- output_ppt_path = base_dir / output_filename
47
- create_translated_ppt(str(ppt_path), str(translated_output_path), str(output_ppt_path))
48
-
49
- except Exception as e:
50
- print(f"Error in process_ppt_file for {ppt_path}: {str(e)}")