Spaces:
Sleeping
Sleeping
update xlsx
Browse files- excel/__pycache__/xlsx.cpython-310.pyc +0 -0
- excel/xlsx.py +430 -0
- pages/upload.py +5 -10
- powerpoint/__pycache__/pptx.cpython-310.pyc +0 -0
- powerpoint/pptx.py +0 -2
- powerpoint/pptx_object.py +0 -354
- powerpoint/pptx_processor.py +0 -50
excel/__pycache__/xlsx.cpython-310.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
excel/xlsx.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
import copy
|
| 4 |
+
import time
|
| 5 |
+
import xml.etree.ElementTree as ET
|
| 6 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 7 |
+
from utils.utils import translate_text, unzip_office_file, preprocess_text, postprocess_text, translate_single_text
|
| 8 |
+
from pymongo import MongoClient
|
| 9 |
+
import gridfs
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
import shutil
|
| 12 |
+
import io
|
| 13 |
+
|
| 14 |
+
NS_MAIN = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}


def register_namespaces(xml_file):
    """Register every namespace declared in *xml_file* with ElementTree.

    This must happen before re-serialising the document, otherwise
    ElementTree invents ns0-style prefixes and Excel may reject the file.
    Also guarantees the spreadsheetml 'main' namespace is registered even
    when the document itself does not declare it.
    """
    declared = {}
    for _event, ns_pair in ET.iterparse(xml_file, events=['start-ns']):
        prefix, uri = ns_pair
        declared[prefix] = uri

    for prefix, uri in declared.items():
        ET.register_namespace(prefix, uri)

    # Prefer registering spreadsheetml as the default namespace when neither
    # '' nor 'main' was declared; otherwise fall back to the 'main' prefix.
    if 'main' not in declared and '' not in declared:
        ET.register_namespace('', NS_MAIN['main'])
    elif 'main' not in declared:
        ET.register_namespace('main', NS_MAIN['main'])
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def extract_text_from_sheet(unzipped_folder_path: str) -> Optional[Tuple[List[Dict[str, Any]], Dict[str, Any]]]:
    """Walk an unpacked .xlsx folder and collect every translatable text node.

    Scans xl/sharedStrings.xml (both rich-text and plain <si> entries) and
    every xl/worksheets/*.xml file (inline strings).  For rich-text entries
    the <rPr> of the first run is deep-copied so the caller can re-apply that
    formatting after the text has been replaced.

    Returns:
        (modifiable_nodes, global_data) — each node dict carries 'type',
        'original_text', 'element', 'first_format', 'source_file' and
        'sheet_name'; global_data holds the parsed trees/paths needed later
        by apply_and_save_changes().
    """
    collected = []
    shared_strings_path = os.path.join(unzipped_folder_path, "xl", "sharedStrings.xml")
    worksheets_folder = os.path.join(unzipped_folder_path, "xl", "worksheets")
    shared_tree = None
    sheet_trees = {}

    # --- sharedStrings.xml: one <si> element per shared string -----------
    if os.path.exists(shared_strings_path):
        try:
            register_namespaces(shared_strings_path)
            shared_tree = ET.parse(shared_strings_path)

            for si in shared_tree.getroot().findall('main:si', NS_MAIN):
                t_nodes = si.findall('.//main:t', NS_MAIN)  # every <t> descendant

                # A direct <r> child marks this entry as rich text.
                first_run = si.find('./main:r', NS_MAIN)
                rich = first_run is not None

                # Deep-copy the first run's <rPr> so it survives rebuilding
                # the <si> later (and does not alias the live tree).
                saved_format = None
                if rich:
                    rpr = first_run.find('./main:rPr', NS_MAIN)
                    if rpr is not None:
                        saved_format = copy.deepcopy(rpr)

                full_text = "".join(t.text for t in t_nodes if t.text)
                if not full_text:
                    continue  # nothing to translate in this entry

                if rich:
                    collected.append({
                        'type': 'shared_rich',
                        'original_text': full_text,
                        'element': si,                # reference to <si>
                        'first_format': saved_format, # first <rPr> clone (or None)
                        'source_file': os.path.join("xl", "sharedStrings.xml"),
                        'sheet_name': None,
                    })
                elif t_nodes:
                    # Plain string: a single direct <t> child.
                    simple_t = si.find('./main:t', NS_MAIN)
                    if simple_t is not None:
                        collected.append({
                            'type': 'shared_simple',
                            'original_text': full_text,
                            'element': simple_t,      # reference to <t>
                            'first_format': None,     # no special formatting
                            'source_file': os.path.join("xl", "sharedStrings.xml"),
                            'sheet_name': None,
                        })

        except Exception as e:
            print(f"Lỗi xử lý sharedStrings: {e}")
            import traceback
            traceback.print_exc()

    # --- worksheets/sheetX.xml: inline strings (no rich formatting) ------
    if os.path.isdir(worksheets_folder):
        for sheet_filename in sorted(os.listdir(worksheets_folder)):
            if not sheet_filename.lower().endswith(".xml"):
                continue
            sheet_file_path = os.path.join(worksheets_folder, sheet_filename)
            try:
                register_namespaces(sheet_file_path)
                tree = ET.parse(sheet_file_path)
                sheet_trees[sheet_filename] = tree
                for cell in tree.getroot().findall('.//main:c[@t="inlineStr"]', NS_MAIN):
                    t_el = cell.find('.//main:is/main:t', NS_MAIN)
                    if t_el is not None and t_el.text is not None:
                        collected.append({
                            'type': 'inline',
                            'original_text': t_el.text,
                            'element': t_el,          # reference to <t>
                            'first_format': None,     # inline strings carry no <rPr>
                            'source_file': os.path.join("xl", "worksheets", sheet_filename),
                            'sheet_name': sheet_filename,
                        })
            except Exception as e:
                print(f"Lỗi xử lý sheet {sheet_filename}: {e}")
                import traceback
                traceback.print_exc()
    else:
        print(f"Lỗi: Không tìm thấy thư mục worksheets: {worksheets_folder}")

    global_data = {
        "shared_tree": shared_tree,
        "sheet_trees": sheet_trees,
        "shared_strings_path": shared_strings_path,
        "worksheets_folder": worksheets_folder,
    }
    return collected, global_data
| 131 |
+
|
| 132 |
+
def apply_and_save_changes(modified_nodes_data: List[Dict[str, Any]], global_data: Dict[str, Any]) -> bool:
    """Write translated text back into the parsed XML trees and save them.

    Rich-text shared strings are rebuilt as a single run that reuses the
    saved first-run <rPr> formatting; simple/inline strings just get their
    <t> text replaced.  xml:space="preserve" is set so leading/trailing
    whitespace survives.  Only files that actually changed are written.

    Returns True when every touched XML file was saved successfully.
    """
    if not global_data:
        print("Lỗi: Thiếu global_data.")
        return False

    xml_space_attr = '{http://www.w3.org/XML/1998/namespace}space'
    touched = set()

    try:
        ET.register_namespace('xml', "http://www.w3.org/XML/1998/namespace")
    except ValueError:
        pass

    for info in modified_nodes_data:
        if 'modified_text' not in info or info['element'] is None:
            continue

        element = info['element']
        new_text = info['modified_text']
        if info.get('original_text', '') == new_text:
            continue  # unchanged — leave the tree alone
        kind = info.get('type', '')
        saved_format = info.get('first_format')  # saved <rPr> clone (or None)

        if kind == 'shared_rich':
            # Rebuild <si> as <r>[<rPr>]<t>new text</t></r>.
            si_element = element
            for child in list(si_element):
                si_element.remove(child)

            new_run = ET.Element(f"{{{NS_MAIN['main']}}}r")
            if saved_format is not None:
                new_run.append(saved_format)

            new_t = ET.Element(f"{{{NS_MAIN['main']}}}t")
            new_t.text = new_text
            new_t.set(xml_space_attr, 'preserve')

            new_run.append(new_t)
            si_element.append(new_run)
            touched.add(info['source_file'])

        elif kind in ('shared_simple', 'inline'):
            element.text = new_text
            if element.attrib.get(xml_space_attr) != 'preserve':
                element.set(xml_space_attr, 'preserve')
            touched.add(info['source_file'])

        else:
            print(f"Cảnh báo: Loại node không xác định '{kind}'")

    # --- Persist every tree whose file was modified ----------------------
    success = True
    shared_tree = global_data.get("shared_tree")
    shared_strings_path = global_data.get("shared_strings_path")
    sheet_trees = global_data.get("sheet_trees", {})
    worksheets_folder = global_data.get("worksheets_folder")

    if shared_tree and shared_strings_path and os.path.join("xl", "sharedStrings.xml") in touched:
        try:
            shared_tree.write(shared_strings_path, encoding='utf-8', xml_declaration=True)
        except Exception as e:
            print(f"Lỗi lưu {shared_strings_path}: {e}")
            success = False

    if worksheets_folder and os.path.exists(worksheets_folder):
        for sheet_filename, tree in sheet_trees.items():
            if os.path.join("xl", "worksheets", sheet_filename) not in touched:
                continue
            sheet_file_path = os.path.join(worksheets_folder, sheet_filename)
            try:
                tree.write(sheet_file_path, encoding='utf-8', xml_declaration=True)
            except Exception as e:
                print(f"Lỗi lưu {sheet_file_path}: {e}")
                success = False

    if success and touched:
        print(f"Đã lưu thành công {len(touched)} file XML đã sửa đổi (đã giữ lại định dạng đầu tiên cho Rich Text).")
    elif not touched:
        print("Không có file XML nào cần cập nhật.")
        return True
    return success
|
| 217 |
+
|
| 218 |
+
def zip_folder_to_excel_file(folder_path, file_name):
    """Zip an unpacked workbook folder back into an .xlsx (in memory) and
    store it in the 'final_file' GridFS collection.

    Args:
        folder_path: Root of the extracted xlsx folder structure.
        file_name: Filename to record in GridFS.

    Returns:
        The GridFS file id on success, or None on any failure.
    """
    try:
        # Build the .xlsx (a plain zip archive) entirely in RAM.
        xlsx_buffer = io.BytesIO()
        with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Archive paths must be relative to the folder root so the
                    # internal layout matches what Excel expects.
                    archive_path = os.path.relpath(file_path, folder_path)
                    zipf.write(file_path, archive_path)

        xlsx_buffer.seek(0)

        # SECURITY NOTE: credentials should not live in source control.
        # The URI can be overridden via the MONGODB_URI environment variable;
        # the hard-coded value is kept only for backward compatibility.
        mongo_uri = os.environ.get(
            "MONGODB_URI",
            "mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
        )
        client = MongoClient(mongo_uri)
        db = client['excel']
        fs = gridfs.GridFS(db, collection='final_file')

        file_id = fs.put(xlsx_buffer.read(), filename=file_name)
        print(f"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}")
        return file_id

    except Exception as e:
        print(f"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}")
        return None
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def get_text_list_from_nodes(modifiable_nodes: Optional[List[Dict[str, Any]]]) -> List[str]:
    """Collect the 'original_text' of every node dict, in order.

    Nodes missing the key or holding None are skipped; a None input yields
    an empty list.
    """
    if modifiable_nodes is None:
        return []

    texts: List[str] = []
    for node in modifiable_nodes:
        value = node.get('original_text')
        if value is not None:
            texts.append(value)
    return texts
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def count_words(text: str) -> int:
    """Return the number of whitespace-separated words in *text*.

    Empty or all-whitespace input counts as zero words.
    """
    if not text or text.isspace():
        return 0
    words = text.split()
    return len(words)
|
| 264 |
+
|
| 265 |
+
# Helper function to process a batch of valid segments (Unchanged)
|
| 266 |
+
def _translate_batch_helper(segments_to_translate, original_indices_1based, source_lang, target_lang):
    """Pre-process, translate and post-process one batch of text segments.

    Returns a list the same length as *segments_to_translate*; when the
    translation fails or the result count does not match, every slot holds
    an error-marker string instead of a translation.

    NOTE(review): *original_indices_1based* is accepted for logging parity
    with callers but is not otherwise used here.
    """
    if not segments_to_translate:
        return []

    try:
        prepared = preprocess_text(segments_to_translate)
        raw_translations = translate_text(prepared, source_lang, target_lang)
        final_translated_segments = postprocess_text(raw_translations)

        if len(final_translated_segments) == len(segments_to_translate):
            return final_translated_segments

        print(f" *** CRITICAL ERROR: Batch translation result count mismatch! Expected {len(segments_to_translate)}, got {len(final_translated_segments)}. Marking batch as failed.")
        return ["<translation_length_mismatch_error>"] * len(segments_to_translate)

    except Exception as e:
        print(f" *** ERROR during batch translation: {e}. Marking batch as failed.")
        # traceback.print_exc()  # uncomment for detailed debugging
        return ["<translation_api_error>"] * len(segments_to_translate)
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):
    """Translate an XLSX stored in GridFS and store the translated copy back.

    The workbook is fetched from the 'root_file' GridFS collection, unpacked,
    its text nodes extracted and translated in dynamically-sized batches
    (segments longer than *max_words_per_segment* are translated one by one),
    then re-zipped and stored via zip_folder_to_excel_file().

    Args:
        file_id: GridFS id of the source workbook.
        file_name: Filename to record for the translated workbook.
        source_lang: Source language code.
        target_lang: Target language code.
        batch_size_segments: Desired maximum number of segments per API call.
        max_words_per_segment: Word limit above which a segment is translated
            individually instead of inside a batch.
        delay_between_requests: Seconds to wait between translation API calls.

    Returns:
        The GridFS id of the translated workbook, or None on failure.
    """
    # BUGFIX: final_id was previously unbound when saving failed (or the
    # length-mismatch branch was taken), raising UnboundLocalError at return.
    final_id = None

    # SECURITY NOTE: credentials should not live in source control; the URI
    # can be overridden via MONGODB_URI (default kept for compatibility).
    mongo_uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
    )
    client = MongoClient(mongo_uri)
    db = client['excel']
    fs = gridfs.GridFS(db, collection='root_file')

    ppt_file = fs.get(file_id)
    excel_file = BytesIO(ppt_file.read())

    xml_folder = unzip_office_file(excel_file)

    modifiable_nodes, global_data = extract_text_from_sheet(xml_folder)
    original_texts = get_text_list_from_nodes(modifiable_nodes)

    all_results = [None] * len(original_texts)
    current_index = 0
    processed_count = 0
    api_call_counter = 0  # used to insert a delay before every call but the first

    while current_index < len(original_texts):
        batch_texts_to_translate = []
        batch_original_indices = []  # 0-based indices for result assignment
        batch_end_index = min(current_index + batch_size_segments, len(original_texts))
        found_long_segment_at = -1   # 0-based index in original_texts

        # 1. Build the next batch, stopping early at the first over-long segment.
        for i in range(current_index, batch_end_index):
            segment = original_texts[i]
            if count_words(segment) <= max_words_per_segment:
                batch_texts_to_translate.append(segment)
                batch_original_indices.append(i)
            else:
                found_long_segment_at = i
                break

        # 2. Translate the batch collected *before* the long segment (if any).
        if batch_texts_to_translate:
            if api_call_counter > 0 and delay_between_requests > 0:
                time.sleep(delay_between_requests)

            translated_batch = _translate_batch_helper(
                batch_texts_to_translate,
                [idx + 1 for idx in batch_original_indices],  # 1-based for logging
                source_lang,
                target_lang,
            )
            api_call_counter += 1
            for batch_idx, original_idx in enumerate(batch_original_indices):
                all_results[original_idx] = translated_batch[batch_idx]
            processed_count += len(batch_texts_to_translate)

        # 3. Translate the over-long segment individually (if one was found).
        if found_long_segment_at != -1:
            long_segment_index = found_long_segment_at
            long_segment_text = str(original_texts[long_segment_index])

            try:
                translated = translate_single_text(long_segment_text, source_lang, target_lang)
                api_call_counter += 1
                # (The former len([translated]) != 1 check was unreachable and
                # has been removed.)
                all_results[long_segment_index] = translated
            except Exception as e:
                print(f" *** ERROR during translation of long segment {long_segment_index + 1}: {e}. Marking as failed.")
                all_results[long_segment_index] = "<translation_api_error>"

            processed_count += 1
            # Resume right after the long segment.
            current_index = long_segment_index + 1
        else:
            # No long segment in this range — advance past it.
            current_index = batch_end_index

    # Safety net: any segment that was never assigned keeps its original text.
    missing_count = 0
    final_texts_for_nodes = []
    for i, res in enumerate(all_results):
        if res is None:
            print(f"LỖI LOGIC: Segment {i+1} không được xử lý! Giữ lại text gốc: '{original_texts[i]}'")
            final_texts_for_nodes.append(original_texts[i])
            missing_count += 1
        else:
            final_texts_for_nodes.append(res)

    if missing_count > 0:
        print(f"CẢNH BÁO NGHIÊM TRỌNG: {missing_count} segments bị bỏ lỡ trong quá trình xử lý.")

    if len(final_texts_for_nodes) != len(original_texts):
        print(f"LỖI NGHIÊM TRỌNG: Số lượng text cuối cùng ({len(final_texts_for_nodes)}) không khớp với gốc ({len(original_texts)}). Hủy bỏ cập nhật.")
    else:
        # Attach the translations to their nodes and write them back.
        for i, node_info in enumerate(modifiable_nodes):
            node_info['modified_text'] = final_texts_for_nodes[i]

        save_success = apply_and_save_changes(modifiable_nodes, global_data)
        if not save_success:
            print("LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.")
        else:
            # Only zip when the XML was saved successfully.
            final_id = zip_folder_to_excel_file(xml_folder, file_name)
            if final_id:
                shutil.rmtree(xml_folder)  # clean up the unpacked folder once zipped
            else:
                print("LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.")
    return final_id
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
|
pages/upload.py
CHANGED
|
@@ -2,7 +2,7 @@ import streamlit as st
|
|
| 2 |
import google.generativeai as genai
|
| 3 |
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
|
| 4 |
from powerpoint.pptx import translate_pptx
|
| 5 |
-
from excel.
|
| 6 |
from word.word_translate import translate_docx_from_mongodb
|
| 7 |
import dotenv
|
| 8 |
import os
|
|
@@ -26,16 +26,11 @@ def process_file(file, file_type):
|
|
| 26 |
|
| 27 |
if file_type == "PPTX":
|
| 28 |
final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
|
| 29 |
-
|
| 30 |
-
# text_dict = extract_text_from_xml(file_id=xml_file_id)
|
| 31 |
-
# translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
| 32 |
-
# progress_bar.progress(60)
|
| 33 |
-
# final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
|
| 34 |
-
# final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
|
| 35 |
elif file_type == "Excel":
|
| 36 |
-
final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
|
| 37 |
-
elif file_type == "CSV":
|
| 38 |
-
|
| 39 |
elif file_type == "Word":
|
| 40 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
| 41 |
else:
|
|
|
|
| 2 |
import google.generativeai as genai
|
| 3 |
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
|
| 4 |
from powerpoint.pptx import translate_pptx
|
| 5 |
+
from excel.xlsx import translate_xlsx
|
| 6 |
from word.word_translate import translate_docx_from_mongodb
|
| 7 |
import dotenv
|
| 8 |
import os
|
|
|
|
| 26 |
|
| 27 |
if file_type == "PPTX":
|
| 28 |
final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
|
| 29 |
+
progress_bar.progress(60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
elif file_type == "Excel":
|
| 31 |
+
final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
|
| 32 |
+
# elif file_type == "CSV":
|
| 33 |
+
# final_id = translate_csv(file_id = file_id, target_lang = target_lang)
|
| 34 |
elif file_type == "Word":
|
| 35 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
| 36 |
else:
|
powerpoint/__pycache__/pptx.cpython-310.pyc
CHANGED
|
Binary files a/powerpoint/__pycache__/pptx.cpython-310.pyc and b/powerpoint/__pycache__/pptx.cpython-310.pyc differ
|
|
|
powerpoint/pptx.py
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
import zipfile
|
| 3 |
import shutil
|
| 4 |
-
from pptx import Presentation
|
| 5 |
from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
|
| 6 |
from powerpoint.xml_handling import *
|
| 7 |
from pymongo import MongoClient
|
| 8 |
import gridfs
|
| 9 |
-
from bson import ObjectId
|
| 10 |
from io import BytesIO
|
| 11 |
|
| 12 |
def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
|
|
|
|
| 1 |
import os
|
| 2 |
import zipfile
|
| 3 |
import shutil
|
|
|
|
| 4 |
from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
|
| 5 |
from powerpoint.xml_handling import *
|
| 6 |
from pymongo import MongoClient
|
| 7 |
import gridfs
|
|
|
|
| 8 |
from io import BytesIO
|
| 9 |
|
| 10 |
def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
|
powerpoint/pptx_object.py
DELETED
|
@@ -1,354 +0,0 @@
|
|
| 1 |
-
# ppt_objects.py
|
| 2 |
-
from pptx import Presentation
|
| 3 |
-
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
|
| 4 |
-
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
| 5 |
-
import xml.etree.ElementTree as ET
|
| 6 |
-
from pptx.util import Pt
|
| 7 |
-
from pptx.dml.color import RGBColor
|
| 8 |
-
import re
|
| 9 |
-
import json
|
| 10 |
-
|
| 11 |
-
from pymongo import MongoClient
|
| 12 |
-
from gridfs import GridFS
|
| 13 |
-
import json
|
| 14 |
-
import xml.etree.ElementTree as ET
|
| 15 |
-
from io import BytesIO
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def apply_group_properties_recursive(shape, shape_index, parent_element):
|
| 19 |
-
"""Recursively applies properties to shapes within groups."""
|
| 20 |
-
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
| 21 |
-
group_element = parent_element.find(f".//group_element[@shape_index='{shape_index}']")
|
| 22 |
-
if group_element is not None:
|
| 23 |
-
for i, sub_shape in enumerate(shape.shapes):
|
| 24 |
-
apply_group_properties_recursive(sub_shape, i, group_element)
|
| 25 |
-
|
| 26 |
-
# Apply properties for sub-shapes WITHIN the group, based on their type.
|
| 27 |
-
if sub_shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
| 28 |
-
table_element = group_element.find(f".//table_element[@shape_index='{i}']")
|
| 29 |
-
if table_element: # Use a shorter name for clarity
|
| 30 |
-
props_element = table_element.find("properties")
|
| 31 |
-
if props_element is not None and props_element.text:
|
| 32 |
-
try:
|
| 33 |
-
table_data = json.loads(props_element.text)
|
| 34 |
-
apply_table_properties(sub_shape.table, table_data)
|
| 35 |
-
except (json.JSONDecodeError, KeyError) as e:
|
| 36 |
-
print(f"Error applying table properties (in group): {str(e)}")
|
| 37 |
-
|
| 38 |
-
elif hasattr(sub_shape, "text_frame") and sub_shape.text_frame:
|
| 39 |
-
text_element = group_element.find(f".//text_element[@shape_index='{i}']")
|
| 40 |
-
if text_element: # Shorter name
|
| 41 |
-
props_element = text_element.find("properties")
|
| 42 |
-
if props_element is not None and props_element.text:
|
| 43 |
-
try:
|
| 44 |
-
shape_data = json.loads(props_element.text)
|
| 45 |
-
apply_shape_properties(sub_shape, shape_data)
|
| 46 |
-
except (json.JSONDecodeError, KeyError) as e:
|
| 47 |
-
print(f"Error applying shape properties (in group): {str(e)}")
|
| 48 |
-
|
| 49 |
-
def get_alignment_value(alignment_str):
|
| 50 |
-
"""Convert alignment string (with extra characters) to PP_ALIGN enum value."""
|
| 51 |
-
alignment_map = {
|
| 52 |
-
'center': PP_ALIGN.CENTER,
|
| 53 |
-
'left': PP_ALIGN.LEFT,
|
| 54 |
-
'right': PP_ALIGN.RIGHT,
|
| 55 |
-
'justify': PP_ALIGN.JUSTIFY
|
| 56 |
-
}
|
| 57 |
-
match = re.match(r"([A-Za-z]+)", alignment_str)
|
| 58 |
-
return alignment_map.get(match.group(1).lower()) if match else None
|
| 59 |
-
|
| 60 |
-
def get_vertical_anchor(value):
|
| 61 |
-
"""Converts vertical_anchor string to MSO_ANCHOR enum."""
|
| 62 |
-
mapping = {
|
| 63 |
-
"TOP": MSO_ANCHOR.TOP,
|
| 64 |
-
"MIDDLE": MSO_ANCHOR.MIDDLE,
|
| 65 |
-
"BOTTOM": MSO_ANCHOR.BOTTOM
|
| 66 |
-
}
|
| 67 |
-
return mapping.get(value.upper().split()[0], MSO_ANCHOR.TOP)
|
| 68 |
-
|
| 69 |
-
def get_table_properties(table):
|
| 70 |
-
"""Extract complete table properties."""
|
| 71 |
-
table_data = {
|
| 72 |
-
'rows': len(table.rows),
|
| 73 |
-
'cols': len(table.columns),
|
| 74 |
-
'cells': []
|
| 75 |
-
}
|
| 76 |
-
for row in table.rows:
|
| 77 |
-
row_data = []
|
| 78 |
-
for cell in row.cells:
|
| 79 |
-
cell_data = {
|
| 80 |
-
'text': cell.text.strip(),
|
| 81 |
-
'font_size': None,
|
| 82 |
-
'font_name': None,
|
| 83 |
-
'alignment': None,
|
| 84 |
-
'margin_left': cell.margin_left,
|
| 85 |
-
'margin_right': cell.margin_right,
|
| 86 |
-
'margin_top': cell.margin_top,
|
| 87 |
-
'margin_bottom': cell.margin_bottom,
|
| 88 |
-
'vertical_anchor': str(cell.vertical_anchor) if cell.vertical_anchor else None,
|
| 89 |
-
'font_color': None
|
| 90 |
-
}
|
| 91 |
-
if cell.text_frame.paragraphs:
|
| 92 |
-
paragraph = cell.text_frame.paragraphs[0]
|
| 93 |
-
if paragraph.runs:
|
| 94 |
-
run = paragraph.runs[0]
|
| 95 |
-
if hasattr(run.font, 'size') and run.font.size is not None:
|
| 96 |
-
cell_data['font_size'] = run.font.size.pt
|
| 97 |
-
if hasattr(run.font, 'name'):
|
| 98 |
-
cell_data['font_name'] = run.font.name
|
| 99 |
-
if hasattr(run.font, 'bold'):
|
| 100 |
-
cell_data['bold'] = run.font.bold
|
| 101 |
-
if hasattr(run.font, 'italic'):
|
| 102 |
-
cell_data['italic'] = run.font.italic
|
| 103 |
-
if (hasattr(run.font, 'color') and
|
| 104 |
-
run.font.color is not None and
|
| 105 |
-
hasattr(run.font.color, 'rgb') and
|
| 106 |
-
run.font.color.rgb is not None):
|
| 107 |
-
cell_data['font_color'] = str(run.font.color.rgb)
|
| 108 |
-
if hasattr(paragraph, 'alignment'):
|
| 109 |
-
cell_data['alignment'] = f"{paragraph.alignment}" if paragraph.alignment else None
|
| 110 |
-
row_data.append(cell_data)
|
| 111 |
-
table_data['cells'].append(row_data)
|
| 112 |
-
return table_data
|
| 113 |
-
|
| 114 |
-
def get_shape_properties(shape):
    """Extract position, size, text and formatting properties from a shape.

    Returns a plain dict that apply_shape_properties() can later replay onto
    a shape.  Font and paragraph settings are sampled from the first run of
    each paragraph; because the loop overwrites the same keys on every
    iteration, the values from the LAST paragraph that has runs win.
    """
    shape_data = {
        'text': '',
        'font_size': None,          # points (float) once extracted
        'font_name': None,
        'alignment': None,          # enum name part of str(paragraph.alignment)
        'width': shape.width,
        'height': shape.height,
        'left': shape.left,
        'top': shape.top,
        'bold': None,
        'italic': None,
        'line_spacing_info': {
            'rule': None,           # "EXACTLY" / "MULTIPLE" / "UNKNOWN"
            'value': None           # only set when spacing is a float multiple
        },
        'space_before': None,       # points
        'space_after': None,        # points
        'font_color': None          # RGB value rendered as a string
    }

    if hasattr(shape, "text"):
        shape_data['text'] = shape.text.strip()
    if hasattr(shape, 'text_frame'):
        for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs):
            if paragraph.runs:
                run = paragraph.runs[0]  # Assuming properties are mostly consistent in the first run
                if hasattr(run.font, 'size') and run.font.size is not None:
                    shape_data['font_size'] = run.font.size.pt
                if hasattr(run.font, 'name'):
                    shape_data['font_name'] = run.font.name
                if hasattr(run.font, 'bold'):
                    shape_data['bold'] = run.font.bold
                if hasattr(run.font, 'italic'):
                    shape_data['italic'] = run.font.italic
                if (hasattr(run.font, 'color') and
                    run.font.color is not None and
                    hasattr(run.font.color, 'rgb') and
                    run.font.color.rgb is not None):
                    shape_data['font_color'] = str(run.font.color.rgb)

            if hasattr(paragraph, 'alignment') and paragraph.alignment is not None:
                shape_data['alignment'] = str(paragraph.alignment).split('.')[-1]
            if hasattr(paragraph, 'space_before'):
                shape_data['space_before'] = paragraph.space_before.pt if paragraph.space_before else None
            if hasattr(paragraph, 'space_after'):
                shape_data['space_after'] = paragraph.space_after.pt if paragraph.space_after else None

            if hasattr(paragraph, 'line_spacing') and paragraph.line_spacing:
                line_spacing = paragraph.line_spacing

                # If line_spacing is a large number (e.g. 84.99 pt), it is likely EXACTLY.
                # NOTE(review): the "> 10" threshold is a heuristic — a genuine
                # multiple greater than 10 would be misclassified; confirm acceptable.
                if isinstance(line_spacing, Pt) or line_spacing > 10:
                    line_spacing_rule = "EXACTLY"
                elif isinstance(line_spacing, float):
                    line_spacing_rule = "MULTIPLE"
                else:
                    line_spacing_rule = "UNKNOWN"

                # NOTE(review): 'value' stays None for the EXACTLY case, so the
                # exact point value is never saved and cannot be restored later —
                # confirm whether that loss is intended.
                shape_data['line_spacing_info'] = {
                    'rule': line_spacing_rule,
                    'value': line_spacing if isinstance(line_spacing, float) else None
                }

    return shape_data
|
| 180 |
-
|
| 181 |
-
def apply_shape_properties(shape, shape_data):
    """Apply properties saved by get_shape_properties() back onto a shape.

    Restores position/size, replaces the text with a single fresh run, and
    re-applies font, alignment and spacing.  Best-effort: any failure is
    printed and the shape is left partially formatted.
    """
    try:
        shape.width = shape_data['width']
        shape.height = shape_data['height']
        shape.left = shape_data['left']
        shape.top = shape_data['top']

        # Replace the existing content with a single fresh run.
        shape.text = ""
        paragraph = shape.text_frame.paragraphs[0]
        run = paragraph.add_run()
        run.text = shape_data.get('text', '')

        if shape_data.get('font_size'):
            # Shrink slightly so translated (often longer) text still fits.
            adjusted_size = shape_data['font_size'] * 0.9
            run.font.size = Pt(adjusted_size)

        # Fall back to Arial when no explicit font was recorded.
        run.font.name = shape_data.get('font_name') or "Arial"
        if shape_data.get('font_color'):
            run.font.color.rgb = RGBColor.from_string(shape_data['font_color'])
        if shape_data.get('bold') is not None:
            run.font.bold = shape_data['bold']
        if shape_data.get('italic') is not None:
            run.font.italic = shape_data['italic']
        if shape_data.get('alignment'):
            paragraph.alignment = get_alignment_value(shape_data['alignment'])

        line_spacing_info = shape_data.get('line_spacing_info', {})
        line_spacing_rule = line_spacing_info.get('rule')
        line_spacing_value = line_spacing_info.get('value')

        if line_spacing_rule and line_spacing_value is not None:
            if line_spacing_rule in ("EXACTLY", "AT_LEAST"):
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "MULTIPLE":
                # A bare float is interpreted as a multiple of single spacing.
                paragraph.line_spacing = line_spacing_value
            else:
                print(f"⚠️ Unknown line spacing rule: {line_spacing_rule}")

        # BUG FIX: the saved values are in points, but assigning a bare number
        # to space_before/space_after is interpreted as EMU by python-pptx
        # (1 pt = 12700 EMU), collapsing the spacing — wrap in Pt().
        if shape_data.get('space_before'):
            paragraph.space_before = Pt(shape_data['space_before'])
        if shape_data.get('space_after'):
            paragraph.space_after = Pt(shape_data['space_after'])

    except Exception as e:
        # Best-effort: keep processing other shapes even if this one fails.
        print(f"Error applying shape properties: {str(e)}")
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
def apply_table_properties(table, table_data):
    """Apply properties saved by get_table_properties() onto a live table.

    Walks the table cell by cell and restores margins, anchoring, text and
    font formatting from the matching entry in ``table_data['cells']``.
    A failure on one cell is printed and does not stop the other cells.
    """
    for row_idx, row in enumerate(table.rows):
        for col_idx, cell in enumerate(row.cells):
            try:
                cell_data = table_data['cells'][row_idx][col_idx]

                # Restore cell margins.
                cell.margin_left = cell_data.get('margin_left', 0)
                cell.margin_right = cell_data.get('margin_right', 0)
                cell.margin_top = cell_data.get('margin_top', 0)
                cell.margin_bottom = cell_data.get('margin_bottom', 0)

                # BUG FIX: the extractor always writes these keys, often with a
                # None value, so every check below tests the VALUE rather than
                # key presence.  The old `'key' in cell_data` checks fed None
                # into get_vertical_anchor()/RGBColor.from_string()/`* 0.9`,
                # which raised and aborted the whole cell's formatting.
                if cell_data.get('vertical_anchor'):
                    cell.vertical_anchor = get_vertical_anchor(cell_data['vertical_anchor'])

                # Clear the old content and write the (translated) text as one run.
                cell.text = ""
                paragraph = cell.text_frame.paragraphs[0]
                run = paragraph.add_run()
                run.text = cell_data.get('text', "")

                if cell_data.get('font_size') is not None:
                    adjusted_size = cell_data['font_size'] * 0.9  # keep font ratio
                    run.font.size = Pt(adjusted_size)

                # Fall back to Arial when no explicit font was recorded.
                run.font.name = cell_data.get('font_name') or 'Arial'

                if cell_data.get('font_color'):
                    run.font.color.rgb = RGBColor.from_string(cell_data['font_color'])

                # Bold & italic.
                run.font.bold = cell_data.get('bold', False)
                run.font.italic = cell_data.get('italic', False)

                # Text alignment.
                if cell_data.get('alignment'):
                    paragraph.alignment = get_alignment_value(cell_data['alignment'])

            except Exception as e:
                print(f"Lỗi khi thiết lập thuộc tính ô [{row_idx}, {col_idx}]: {str(e)}")
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
def get_file_from_mongodb(db_name, collection_name, file_id):
    """Fetch a file from MongoDB GridFS.

    Returns the GridOut object itself (not its bytes), so the caller can
    read it lazily and access metadata such as ``filename``.
    """
    # SECURITY NOTE(review): credentials are hard-coded in the connection URI;
    # move them to configuration / environment variables.
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    db = client[db_name]
    fs = GridFS(db, collection_name)
    # NOTE(review): the client is never closed here — the returned GridOut
    # reads from the connection lazily, so closing it would break callers
    # that .read() later.  Confirm this leak-by-design is intended.
    file_data = fs.get(file_id)
    return file_data
    # return BytesIO(file_data.read())
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
    """Store a file in MongoDB GridFS and return the new file's id."""
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    bucket = GridFS(client[db_name], collection_name)
    stored_id = bucket.put(file_data, filename=file_name)
    client.close()
    return stored_id
|
| 298 |
-
|
| 299 |
-
def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
    """Create a translated PowerPoint from MongoDB data and save it back.

    Loads the original .pptx (collection "root_file") and the translated XML
    (collection "final_xml") by id, replays the per-slide / per-shape
    translated properties onto the deck, and stores the result into
    ``output_collection``.  Returns the new GridFS file id, or None on error.
    """
    try:
        # Connect to MongoDB and download both source files.
        original_ppt= get_file_from_mongodb(db_name, "root_file", original_ppt_id)
        translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)

        # Load the original PowerPoint and the translated XML.
        prs = Presentation(BytesIO(original_ppt.read()))
        tree = ET.parse(BytesIO(translated_xml.read()))
        root = tree.getroot()

        # Apply the translation; slide numbering is 1-based to match the
        # XML's slide[@number] attribute.
        for slide_number, slide in enumerate(prs.slides, 1):
            xml_slide = root.find(f".//slide[@number='{slide_number}']")
            if xml_slide is None:
                continue
            for shape_index, shape in enumerate(slide.shapes):
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    apply_group_properties_recursive(shape, shape_index, xml_slide)
                elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = xml_slide.find(f".//table_element[@shape_index='{shape_index}']")
                    if table_element is not None:
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                # Shape properties are stored as JSON text inside the XML element.
                                table_data = json.loads(props_element.text)
                                apply_table_properties(shape.table, table_data)
                            except Exception as e:
                                print(f"Error applying table properties: {str(e)}")
                elif hasattr(shape, "text"):
                    text_element = xml_slide.find(f".//text_element[@shape_index='{shape_index}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(shape, shape_data)
                            except Exception as e:
                                print(f"Error applying shape properties: {str(e)}")

        # Serialize the PowerPoint into memory for upload.
        output_io = BytesIO()
        prs.save(output_io)
        output_io.seek(0)  # rewind before uploading

        # Keep the original file name, adding a "_translated" suffix.
        # NOTE(review): the source file is a .pptx, so replace(".xml", ".pptx")
        # is a no-op and no "_translated" suffix is ever added — the file is
        # saved under its original name.  Confirm the intended output filename.
        translated_filename = original_ppt.filename.replace(".xml", ".pptx")

        file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
        print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")

        return file_id
    except Exception as e:
        print(f"Error creating translated PowerPoint: {str(e)}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
powerpoint/pptx_processor.py
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
# ppt_processor.py
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from xml_handling import ppt_to_xml, translate_xml_file
|
| 4 |
-
from pptx_object import create_translated_ppt
|
| 5 |
-
import os
|
| 6 |
-
|
| 7 |
-
def process_ppt_file(ppt_path: Path, source_lang: str, target_lang: str):
    """Process a single PPT/PPTX file from XML extraction to final translation.

    Accepts either a str (possibly shell-quoted / escaped, e.g. from a
    drag-and-drop) or a Path.  Writes ``<stem>_original.xml``,
    ``<stem>_translated.xml`` and ``<stem>_translated<ext>`` next to the
    input file.  Errors are printed, not raised; returns None.
    """
    # BUG FIX: the parameter is annotated as Path but the cleanup below uses
    # str-only methods (.strip / .replace), so passing an actual Path raised
    # AttributeError.  Normalize to str first so both input types work.
    ppt_path = str(ppt_path)
    # Remove surrounding quotes and shell escapes.
    ppt_path = ppt_path.strip("'\"")
    ppt_path = ppt_path.replace("\\ ", " ")
    ppt_path = ppt_path.replace("\\'", "'")
    ppt_path = os.path.expanduser(ppt_path)
    ppt_path = Path(ppt_path).resolve()
    # TODO: switch to a DB link on the server
    try:
        if not ppt_path.is_file():
            print(f"Error: '{ppt_path}' is not a valid file.")
            return
        if ppt_path.suffix.lower() not in ['.ppt', '.pptx']:
            print(f"Error: '{ppt_path}' is not a PowerPoint file.")
            return

        base_dir = ppt_path.parent

        # Original XML
        print(f"Generating original XML for {ppt_path.name}...")
        original_xml = ppt_to_xml(str(ppt_path))
        if original_xml:
            original_output_path = base_dir / f"{ppt_path.stem}_original.xml"
            with open(original_output_path, 'w', encoding='utf-8') as f:
                f.write(original_xml)
            print(f"Original XML saved: {original_output_path}")

        # Save original XML to MongoDB
        # save_xml_to_mongodb(original_xml, ppt_path.stem + "_original.xml")

        # Translated XML
        print(f"Generating translated XML (from {source_lang} to {target_lang}) for {ppt_path.name}...")
        translated_output_path = base_dir / f"{ppt_path.stem}_translated.xml"
        original_xml_path = base_dir / f"{ppt_path.stem}_original.xml"
        translate_xml_file(str(original_xml_path), str(translated_output_path), source_lang, target_lang)

        # Create Translated PPT
        print(f"Creating translated PPT for {ppt_path.name}...")
        output_filename = f"{ppt_path.stem}_translated{ppt_path.suffix}"
        output_ppt_path = base_dir / output_filename
        create_translated_ppt(str(ppt_path), str(translated_output_path), str(output_ppt_path))

    except Exception as e:
        print(f"Error in process_ppt_file for {ppt_path}: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|