mintlee committed on
Commit
4d84219
·
1 Parent(s): 73196e5

update xlsx

Browse files
excel/__pycache__/xlsx.cpython-310.pyc ADDED
Binary file (11.3 kB). View file
 
excel/xlsx.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import copy
4
+ import time
5
+ import xml.etree.ElementTree as ET
6
+ from typing import List, Dict, Any, Optional, Tuple
7
+ from utils.utils import translate_text, unzip_office_file, preprocess_text, postprocess_text, translate_single_text
8
+ from pymongo import MongoClient
9
+ import gridfs
10
+ from io import BytesIO
11
+ import shutil
12
+ import io
13
+
14
+ NS_MAIN = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
15
+
16
# --- Namespace registration (important when writing the file back) ---
def register_namespaces(xml_file):
    """Read the namespace declarations from *xml_file* and register them
    globally with ElementTree so that serialization preserves the original
    prefixes instead of generating ns0/ns1 aliases.

    Args:
        xml_file: Path to an XML file to scan for ``xmlns`` declarations.
    """
    namespaces = dict(
        node for _, node in ET.iterparse(xml_file, events=['start-ns'])
    )
    for ns, uri in namespaces.items():
        try:
            ET.register_namespace(ns, uri)
        except ValueError:
            # BUGFIX: register_namespace raises ValueError for reserved
            # prefixes (those starting with "xml"); skip instead of aborting.
            pass
    # Also register the spreadsheet "main" namespace if the file did not declare it.
    if 'main' not in namespaces and '' not in namespaces:  # check the empty prefix too
        ET.register_namespace('', NS_MAIN['main'])  # register as the default namespace
    elif 'main' not in namespaces:
        ET.register_namespace('main', NS_MAIN['main'])  # register under the 'main' prefix
29
+
30
+
31
def extract_text_from_sheet(unzipped_folder_path: str) -> Optional[Tuple[List[Dict[str, Any]], Dict[str, Any]]]:
    """
    Extract translatable text from an unzipped XLSX package, keeping the
    formatting (<rPr>) of the first run when a shared string is Rich Text.

    Returns a tuple of:
      - modifiable_nodes: list of dicts, each holding the original text, a
        live reference to the XML element to rewrite, the saved first-run
        format (or None), and the relative source file it came from.
      - global_data: parsed trees and paths needed later by
        apply_and_save_changes() to write the modified XML back.
    """
    modifiable_nodes = []
    shared_strings_path = os.path.join(unzipped_folder_path, "xl", "sharedStrings.xml")
    worksheets_folder = os.path.join(unzipped_folder_path, "xl", "worksheets")
    shared_tree = None
    sheet_trees = {}

    # --- Process sharedStrings.xml ---
    if os.path.exists(shared_strings_path):
        try:
            register_namespaces(shared_strings_path)
            shared_tree = ET.parse(shared_strings_path)
            root_shared = shared_tree.getroot()

            for si_element in root_shared.findall('main:si', NS_MAIN):
                text_parts = []
                t_elements = si_element.findall('.//main:t', NS_MAIN)  # find all descendant <t>

                # Find the first run (<r>) and its properties (<rPr>)
                first_r = si_element.find('./main:r', NS_MAIN)  # first direct <r> child
                first_rpr_clone = None  # holds a copy of the first <rPr>
                is_rich_text = first_r is not None

                if is_rich_text:
                    # Look for <rPr> inside the first <r>
                    first_rpr = first_r.find('./main:rPr', NS_MAIN)
                    if first_rpr is not None:
                        # Deep-copy so we don't disturb the original tree and can reuse it later
                        first_rpr_clone = copy.deepcopy(first_rpr)

                # Concatenate the full text of the shared string
                for t_node in t_elements:
                    if t_node.text:
                        text_parts.append(t_node.text)
                full_text = "".join(text_parts)

                if not full_text: continue  # skip entries with no text

                if is_rich_text:
                    modifiable_nodes.append({
                        'type': 'shared_rich',
                        'original_text': full_text,
                        'element': si_element,  # reference to <si>
                        'first_format': first_rpr_clone,  # first <rPr> format (or None)
                        'source_file': os.path.join("xl", "sharedStrings.xml"),
                        'sheet_name': None
                    })
                elif t_elements:  # not rich text: use the plain <t> tag
                    first_t = si_element.find('./main:t', NS_MAIN)
                    if first_t is not None:
                        modifiable_nodes.append({
                            'type': 'shared_simple',
                            'original_text': full_text,
                            'element': first_t,  # reference to <t>
                            'first_format': None,  # no special formatting
                            'source_file': os.path.join("xl", "sharedStrings.xml"),
                            'sheet_name': None
                        })

        except Exception as e:
            print(f"Lỗi xử lý sharedStrings: {e}")
            import traceback
            traceback.print_exc()

    # --- Process the sheetX.xml files (inline strings - no complex formatting) ---
    if os.path.isdir(worksheets_folder):
        for sheet_filename in sorted(os.listdir(worksheets_folder)):
            if sheet_filename.lower().endswith(".xml"):
                # ... (read and parse the sheet tree as before) ...
                sheet_file_path = os.path.join(worksheets_folder, sheet_filename)
                try:
                    register_namespaces(sheet_file_path)
                    sheet_tree = ET.parse(sheet_file_path)
                    sheet_trees[sheet_filename] = sheet_tree
                    root_sheet = sheet_tree.getroot()
                    for cell in root_sheet.findall('.//main:c[@t="inlineStr"]', NS_MAIN):
                        t_element = cell.find('.//main:is/main:t', NS_MAIN)
                        if t_element is not None and t_element.text is not None:
                            modifiable_nodes.append({
                                'type': 'inline',
                                'original_text': t_element.text,
                                'element': t_element,  # reference to <t>
                                'first_format': None,  # inline strings carry no <rPr> formatting
                                'source_file': os.path.join("xl", "worksheets", sheet_filename),
                                'sheet_name': sheet_filename
                            })
                except Exception as e:
                    print(f"Lỗi xử lý sheet {sheet_filename}: {e}")
                    import traceback
                    traceback.print_exc()

    else:
        print(f"Lỗi: Không tìm thấy thư mục worksheets: {worksheets_folder}")


    global_data = {"shared_tree": shared_tree, "sheet_trees": sheet_trees, "shared_strings_path": shared_strings_path, "worksheets_folder": worksheets_folder}
    return modifiable_nodes, global_data
131
+
132
def apply_and_save_changes(modified_nodes_data: List[Dict[str, Any]], global_data: Dict[str, Any]) -> bool:
    """
    Write translated text back into the XML trees and save the changed files.

    For Rich Text shared strings the children of <si> are rebuilt as a single
    run carrying the saved first-run format; simple/inline strings only have
    their <t> text replaced. Returns True when all touched files saved OK.
    """
    if not global_data: print("Lỗi: Thiếu global_data."); return False

    updated_files = set()
    # Registering the reserved 'xml' prefix raises ValueError on some Python
    # versions; ignore it so xml:space attributes still serialize correctly.
    try: ET.register_namespace('xml', "http://www.w3.org/XML/1998/namespace")
    except ValueError: pass

    for node_info in modified_nodes_data:
        if 'modified_text' in node_info and node_info['element'] is not None:
            element = node_info['element']
            modified_text = node_info['modified_text']
            original_text = node_info.get('original_text', '')
            node_type = node_info.get('type', '')
            first_format = node_info.get('first_format')  # saved <rPr> clone (or None)

            if original_text != modified_text:
                # --- Rich Text: rebuild the <si><r>[<rPr>]<t></r></si> structure ---
                if node_type == 'shared_rich':
                    si_element = element
                    # Remove the old children (all original runs)
                    for child in list(si_element):
                        si_element.remove(child)

                    # Create a new run <r>
                    new_r = ET.Element(f"{{{NS_MAIN['main']}}}r")

                    # If a first-run format (<rPr>) was saved, attach it to the new <r>
                    if first_format is not None:
                        new_r.append(first_format)  # append the saved <rPr> clone

                    # Create the new text element <t>
                    new_t = ET.Element(f"{{{NS_MAIN['main']}}}t")
                    new_t.text = modified_text
                    # xml:space="preserve" keeps leading/trailing whitespace in Excel
                    xml_space_attr = '{http://www.w3.org/XML/1998/namespace}space'
                    new_t.set(xml_space_attr, 'preserve')

                    # Attach <t> to <r>
                    new_r.append(new_t)
                    # Attach <r> to <si>
                    si_element.append(new_r)

                    updated_files.add(node_info['source_file'])
                    # print(f"Applied first format to Rich Text in {node_info['source_file']}")

                # --- Simple/Inline text: just update the <t> element ---
                elif node_type in ['shared_simple', 'inline']:
                    t_element = element
                    t_element.text = modified_text
                    xml_space_attr = '{http://www.w3.org/XML/1998/namespace}space'
                    if xml_space_attr not in t_element.attrib or t_element.attrib[xml_space_attr] != 'preserve':
                        t_element.set(xml_space_attr, 'preserve')
                    updated_files.add(node_info['source_file'])
                    # print(f"Updated Simple/Inline Text in {node_info['source_file']}")
                else:
                    print(f"Cảnh báo: Loại node không xác định '{node_type}'")

    # --- Save the modified XML files (unchanged logic) ---
    success = True
    # ... (file-saving code as before) ...
    shared_tree = global_data.get("shared_tree"); shared_strings_path = global_data.get("shared_strings_path")
    sheet_trees = global_data.get("sheet_trees", {}); worksheets_folder = global_data.get("worksheets_folder")

    shared_strings_relative_path = os.path.join("xl", "sharedStrings.xml")
    if shared_tree and shared_strings_path and shared_strings_relative_path in updated_files:
        try:
            # print(f"Saving modified file: {shared_strings_path}")
            shared_tree.write(shared_strings_path, encoding='utf-8', xml_declaration=True)
        except Exception as e: print(f"Lỗi lưu {shared_strings_path}: {e}"); success = False

    if worksheets_folder and os.path.exists(worksheets_folder):
        for sheet_filename, sheet_tree in sheet_trees.items():
            sheet_relative_path = os.path.join("xl", "worksheets", sheet_filename)
            if sheet_relative_path in updated_files:
                sheet_file_path = os.path.join(worksheets_folder, sheet_filename)
                try:
                    # print(f"Saving modified file: {sheet_file_path}")
                    sheet_tree.write(sheet_file_path, encoding='utf-8', xml_declaration=True)
                except Exception as e: print(f"Lỗi lưu {sheet_file_path}: {e}"); success = False

    if success and updated_files: print(f"Đã lưu thành công {len(updated_files)} file XML đã sửa đổi (đã giữ lại định dạng đầu tiên cho Rich Text).")
    elif not updated_files: print("Không có file XML nào cần cập nhật.") ; return True
    return success
217
+
218
def zip_folder_to_excel_file(folder_path, file_name):
    """
    Zip an unpacked OOXML folder back into an .xlsx (entirely in memory) and
    store the result in MongoDB GridFS.

    Args:
        folder_path: Root directory of the unzipped workbook package.
        file_name: Filename to record for the stored GridFS file.

    Returns:
        The GridFS file id on success, or None on failure.
    """
    client = None
    try:
        # Build the .xlsx archive in RAM
        xlsx_buffer = io.BytesIO()
        with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Store entries relative to the folder root so the archive
                    # layout matches the OOXML package structure.
                    archive_path = os.path.relpath(file_path, folder_path)
                    zipf.write(file_path, archive_path)

        xlsx_buffer.seek(0)

        # NOTE(security): credentials are hard-coded in the URI — move them
        # to configuration/environment variables.
        client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
        db = client['excel']
        fs = gridfs.GridFS(db, collection='final_file')

        file_id = fs.put(xlsx_buffer.read(), filename=file_name)
        print(f"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}")
        return file_id

    except Exception as e:
        print(f"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}")
        return None
    finally:
        # BUGFIX: the MongoClient was previously leaked on every call.
        if client is not None:
            client.close()
242
+
243
+
244
def get_text_list_from_nodes(modifiable_nodes: Optional[List[Dict[str, Any]]]) -> List[str]:
    """Collect the 'original_text' value of every node dict.

    Entries whose 'original_text' key is missing or None are skipped.
    Returns an empty list when *modifiable_nodes* is None.
    """
    if modifiable_nodes is None:
        return []

    texts: List[str] = []
    for node in modifiable_nodes:
        value = node.get('original_text')
        if value is not None:
            texts.append(value)
    return texts
257
+
258
+
259
def count_words(text: str) -> int:
    """Return the number of whitespace-delimited tokens in *text*.

    Empty or all-whitespace input counts as zero words.
    """
    return len(text.split()) if text and not text.isspace() else 0
264
+
265
# Helper function to process a batch of valid segments (unchanged behavior)
def _translate_batch_helper(segments_to_translate, original_indices_1based, source_lang, target_lang):
    """Preprocess, translate and postprocess one batch of segments.

    Any failure (API error or a result-count mismatch) is mapped to one error
    marker per segment so the caller's index alignment is preserved.
    """
    if not segments_to_translate:
        return []

    expected = len(segments_to_translate)
    try:
        cleaned = preprocess_text(segments_to_translate)
        raw_translation = translate_text(cleaned, source_lang, target_lang)
        finished = postprocess_text(raw_translation)
    except Exception as e:
        print(f" *** ERROR during batch translation: {e}. Marking batch as failed.")
        # traceback.print_exc() # Uncomment for detailed debug
        return ["<translation_api_error>"] * expected

    if len(finished) == expected:
        return finished

    print(f" *** CRITICAL ERROR: Batch translation result count mismatch! Expected {expected}, got {len(finished)}. Marking batch as failed.")
    return ["<translation_length_mismatch_error>"] * expected
292
+
293
+
294
def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):
    """
    Translate an XLSX workbook stored in MongoDB GridFS, batching short text
    segments and translating over-long segments individually.

    Args:
        file_id: GridFS id of the source workbook in the 'root_file' collection.
        file_name (str): Name under which the translated workbook is stored.
        source_lang (str): Source language code.
        target_lang (str): Target language code.
        batch_size_segments (int): Desired maximum number of segments per API call.
        max_words_per_segment (int): Word limit for a segment to be batch-translated;
            longer segments are translated one by one.
        delay_between_requests (int): Seconds to wait between translation API calls.

    Returns:
        The GridFS id of the translated workbook, or None if any step failed.
    """
    # NOTE(security): credentials are hard-coded in the URI — move them to
    # configuration/environment variables.
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    try:
        db = client['excel']
        fs = gridfs.GridFS(db, collection='root_file')
        excel_file = BytesIO(fs.get(file_id).read())
    finally:
        # BUGFIX: the client was previously never closed.
        client.close()

    xml_folder = unzip_office_file(excel_file)

    modifiable_nodes, global_data = extract_text_from_sheet(xml_folder)

    original_texts = get_text_list_from_nodes(modifiable_nodes)

    all_results = [None] * len(original_texts)
    current_index = 0
    processed_count = 0
    api_call_counter = 0  # Track API calls for delay logic

    while current_index < len(original_texts):
        batch_texts_to_translate = []
        batch_original_indices = []  # 0-based indices for assignment
        batch_end_index = min(current_index + batch_size_segments, len(original_texts))
        found_long_segment_at = -1  # 0-based index in original_texts

        # 1. Build the next potential batch, stopping if a long segment is found
        for i in range(current_index, batch_end_index):
            segment = original_texts[i]
            word_count = count_words(segment)

            if word_count <= max_words_per_segment:
                batch_texts_to_translate.append(segment)
                batch_original_indices.append(i)
            else:
                found_long_segment_at = i
                break  # Stop building this batch

        # 2. Translate the VALID batch collected *before* the long segment (if any)
        if batch_texts_to_translate:
            # Add delay BEFORE the API call if it's not the very first call
            if api_call_counter > 0 and delay_between_requests > 0:
                time.sleep(delay_between_requests)

            translated_batch = _translate_batch_helper(
                batch_texts_to_translate,
                [idx + 1 for idx in batch_original_indices],  # 1-based for logging
                source_lang,
                target_lang
            )
            api_call_counter += 1
            # Assign results back
            for batch_idx, original_idx in enumerate(batch_original_indices):
                all_results[original_idx] = translated_batch[batch_idx]
            processed_count += len(batch_texts_to_translate)

        # 3. Handle the long segment INDIVIDUALLY (if one was found)
        if found_long_segment_at != -1:
            long_segment_index = found_long_segment_at
            long_segment_text = str(original_texts[long_segment_index])

            try:
                # BUGFIX: removed the dead `if len([translated]) == 1` check —
                # a single-element list always has length 1.
                translated = translate_single_text(long_segment_text, source_lang, target_lang)
                api_call_counter += 1
                all_results[long_segment_index] = translated

            except Exception as e:
                print(f" *** ERROR during translation of long segment {long_segment_index + 1}: {e}. Marking as failed.")
                # traceback.print_exc() # Uncomment for detailed debug
                all_results[long_segment_index] = "<translation_api_error>"
                # Do not increment api_call_counter if the API call itself failed before returning

            processed_count += 1
            # Update current_index to start AFTER this long segment
            current_index = long_segment_index + 1

        else:
            # No long segment was found in the range checked.
            # Move current_index to the end of the range examined.
            current_index = batch_end_index

    # Any segment left unprocessed keeps its original text.
    missing_count = 0
    final_texts_for_nodes = []
    for i, res in enumerate(all_results):
        if res is None:
            print(f"LỖI LOGIC: Segment {i+1} không được xử lý! Giữ lại text gốc: '{original_texts[i]}'")
            final_texts_for_nodes.append(original_texts[i])
            missing_count += 1
        else:
            final_texts_for_nodes.append(res)

    if missing_count > 0:
        print(f"CẢNH BÁO NGHIÊM TRỌNG: {missing_count} segments bị bỏ lỡ trong quá trình xử lý.")

    # BUGFIX: final_id was previously unbound when the length check or the XML
    # save failed, raising NameError at `return final_id`.
    final_id = None
    if len(final_texts_for_nodes) != len(original_texts):
        print(f"LỖI NGHIÊM TRỌNG: Số lượng text cuối cùng ({len(final_texts_for_nodes)}) không khớp với gốc ({len(original_texts)}). Hủy bỏ cập nhật.")
    else:
        # Attach the translations to the nodes
        for i, node_info in enumerate(modifiable_nodes):
            node_info['modified_text'] = final_texts_for_nodes[i]

        save_success = apply_and_save_changes(modifiable_nodes, global_data)
        if not save_success:
            print("LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.")
        else:
            # Only zip if saving XML was successful
            final_id = zip_folder_to_excel_file(xml_folder, file_name)
            if final_id:
                shutil.rmtree(xml_folder)  # Mark folder as 'handled' by zipping
            else:
                print("LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.")
    return final_id
428
+
429
+
430
+
pages/upload.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import google.generativeai as genai
3
  from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
4
  from powerpoint.pptx import translate_pptx
5
- from excel.excel_translate import translate_xlsx, translate_csv
6
  from word.word_translate import translate_docx_from_mongodb
7
  import dotenv
8
  import os
@@ -26,16 +26,11 @@ def process_file(file, file_type):
26
 
27
  if file_type == "PPTX":
28
  final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
29
- # progress_bar.progress(40)
30
- # text_dict = extract_text_from_xml(file_id=xml_file_id)
31
- # translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
32
- # progress_bar.progress(60)
33
- # final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
34
- # final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
35
  elif file_type == "Excel":
36
- final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
37
- elif file_type == "CSV":
38
- final_id = translate_csv(file_id = file_id, target_lang = target_lang)
39
  elif file_type == "Word":
40
  final_id = translate_docx_from_mongodb(file_id, target_lang)
41
  else:
 
2
  import google.generativeai as genai
3
  from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
4
  from powerpoint.pptx import translate_pptx
5
+ from excel.xlsx import translate_xlsx
6
  from word.word_translate import translate_docx_from_mongodb
7
  import dotenv
8
  import os
 
26
 
27
  if file_type == "PPTX":
28
  final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
29
+ progress_bar.progress(60)
 
 
 
 
 
30
  elif file_type == "Excel":
31
+ final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
32
+ # elif file_type == "CSV":
33
+ # final_id = translate_csv(file_id = file_id, target_lang = target_lang)
34
  elif file_type == "Word":
35
  final_id = translate_docx_from_mongodb(file_id, target_lang)
36
  else:
powerpoint/__pycache__/pptx.cpython-310.pyc CHANGED
Binary files a/powerpoint/__pycache__/pptx.cpython-310.pyc and b/powerpoint/__pycache__/pptx.cpython-310.pyc differ
 
powerpoint/pptx.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
  import zipfile
3
  import shutil
4
- from pptx import Presentation
5
  from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
6
  from powerpoint.xml_handling import *
7
  from pymongo import MongoClient
8
  import gridfs
9
- from bson import ObjectId
10
  from io import BytesIO
11
 
12
  def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
 
1
  import os
2
  import zipfile
3
  import shutil
 
4
  from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
5
  from powerpoint.xml_handling import *
6
  from pymongo import MongoClient
7
  import gridfs
 
8
  from io import BytesIO
9
 
10
  def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
powerpoint/pptx_object.py DELETED
@@ -1,354 +0,0 @@
1
- # ppt_objects.py
2
- from pptx import Presentation
3
- from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
4
- from pptx.enum.shapes import MSO_SHAPE_TYPE
5
- import xml.etree.ElementTree as ET
6
- from pptx.util import Pt
7
- from pptx.dml.color import RGBColor
8
- import re
9
- import json
10
-
11
- from pymongo import MongoClient
12
- from gridfs import GridFS
13
- import json
14
- import xml.etree.ElementTree as ET
15
- from io import BytesIO
16
-
17
-
18
- def apply_group_properties_recursive(shape, shape_index, parent_element):
19
- """Recursively applies properties to shapes within groups."""
20
- if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
21
- group_element = parent_element.find(f".//group_element[@shape_index='{shape_index}']")
22
- if group_element is not None:
23
- for i, sub_shape in enumerate(shape.shapes):
24
- apply_group_properties_recursive(sub_shape, i, group_element)
25
-
26
- # Apply properties for sub-shapes WITHIN the group, based on their type.
27
- if sub_shape.shape_type == MSO_SHAPE_TYPE.TABLE:
28
- table_element = group_element.find(f".//table_element[@shape_index='{i}']")
29
- if table_element: # Use a shorter name for clarity
30
- props_element = table_element.find("properties")
31
- if props_element is not None and props_element.text:
32
- try:
33
- table_data = json.loads(props_element.text)
34
- apply_table_properties(sub_shape.table, table_data)
35
- except (json.JSONDecodeError, KeyError) as e:
36
- print(f"Error applying table properties (in group): {str(e)}")
37
-
38
- elif hasattr(sub_shape, "text_frame") and sub_shape.text_frame:
39
- text_element = group_element.find(f".//text_element[@shape_index='{i}']")
40
- if text_element: # Shorter name
41
- props_element = text_element.find("properties")
42
- if props_element is not None and props_element.text:
43
- try:
44
- shape_data = json.loads(props_element.text)
45
- apply_shape_properties(sub_shape, shape_data)
46
- except (json.JSONDecodeError, KeyError) as e:
47
- print(f"Error applying shape properties (in group): {str(e)}")
48
-
49
- def get_alignment_value(alignment_str):
50
- """Convert alignment string (with extra characters) to PP_ALIGN enum value."""
51
- alignment_map = {
52
- 'center': PP_ALIGN.CENTER,
53
- 'left': PP_ALIGN.LEFT,
54
- 'right': PP_ALIGN.RIGHT,
55
- 'justify': PP_ALIGN.JUSTIFY
56
- }
57
- match = re.match(r"([A-Za-z]+)", alignment_str)
58
- return alignment_map.get(match.group(1).lower()) if match else None
59
-
60
- def get_vertical_anchor(value):
61
- """Converts vertical_anchor string to MSO_ANCHOR enum."""
62
- mapping = {
63
- "TOP": MSO_ANCHOR.TOP,
64
- "MIDDLE": MSO_ANCHOR.MIDDLE,
65
- "BOTTOM": MSO_ANCHOR.BOTTOM
66
- }
67
- return mapping.get(value.upper().split()[0], MSO_ANCHOR.TOP)
68
-
69
- def get_table_properties(table):
70
- """Extract complete table properties."""
71
- table_data = {
72
- 'rows': len(table.rows),
73
- 'cols': len(table.columns),
74
- 'cells': []
75
- }
76
- for row in table.rows:
77
- row_data = []
78
- for cell in row.cells:
79
- cell_data = {
80
- 'text': cell.text.strip(),
81
- 'font_size': None,
82
- 'font_name': None,
83
- 'alignment': None,
84
- 'margin_left': cell.margin_left,
85
- 'margin_right': cell.margin_right,
86
- 'margin_top': cell.margin_top,
87
- 'margin_bottom': cell.margin_bottom,
88
- 'vertical_anchor': str(cell.vertical_anchor) if cell.vertical_anchor else None,
89
- 'font_color': None
90
- }
91
- if cell.text_frame.paragraphs:
92
- paragraph = cell.text_frame.paragraphs[0]
93
- if paragraph.runs:
94
- run = paragraph.runs[0]
95
- if hasattr(run.font, 'size') and run.font.size is not None:
96
- cell_data['font_size'] = run.font.size.pt
97
- if hasattr(run.font, 'name'):
98
- cell_data['font_name'] = run.font.name
99
- if hasattr(run.font, 'bold'):
100
- cell_data['bold'] = run.font.bold
101
- if hasattr(run.font, 'italic'):
102
- cell_data['italic'] = run.font.italic
103
- if (hasattr(run.font, 'color') and
104
- run.font.color is not None and
105
- hasattr(run.font.color, 'rgb') and
106
- run.font.color.rgb is not None):
107
- cell_data['font_color'] = str(run.font.color.rgb)
108
- if hasattr(paragraph, 'alignment'):
109
- cell_data['alignment'] = f"{paragraph.alignment}" if paragraph.alignment else None
110
- row_data.append(cell_data)
111
- table_data['cells'].append(row_data)
112
- return table_data
113
-
114
- def get_shape_properties(shape):
115
- """Extract all properties from a shape, with detailed debug prints."""
116
- shape_data = {
117
- 'text': '',
118
- 'font_size': None,
119
- 'font_name': None,
120
- 'alignment': None,
121
- 'width': shape.width,
122
- 'height': shape.height,
123
- 'left': shape.left,
124
- 'top': shape.top,
125
- 'bold': None,
126
- 'italic': None,
127
- 'line_spacing_info': {
128
- 'rule': None,
129
- 'value': None
130
- },
131
- 'space_before': None,
132
- 'space_after': None,
133
- 'font_color': None
134
- }
135
-
136
- if hasattr(shape, "text"):
137
- shape_data['text'] = shape.text.strip()
138
- if hasattr(shape, 'text_frame'):
139
- for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs):
140
- if paragraph.runs:
141
- run = paragraph.runs[0] # Assuming properties are mostly consistent in the first run
142
- if hasattr(run.font, 'size') and run.font.size is not None:
143
- shape_data['font_size'] = run.font.size.pt
144
- if hasattr(run.font, 'name'):
145
- shape_data['font_name'] = run.font.name
146
- if hasattr(run.font, 'bold'):
147
- shape_data['bold'] = run.font.bold
148
- if hasattr(run.font, 'italic'):
149
- shape_data['italic'] = run.font.italic
150
- if (hasattr(run.font, 'color') and
151
- run.font.color is not None and
152
- hasattr(run.font.color, 'rgb') and
153
- run.font.color.rgb is not None):
154
- shape_data['font_color'] = str(run.font.color.rgb)
155
-
156
- if hasattr(paragraph, 'alignment') and paragraph.alignment is not None:
157
- shape_data['alignment'] = str(paragraph.alignment).split('.')[-1]
158
- if hasattr(paragraph, 'space_before'):
159
- shape_data['space_before'] = paragraph.space_before.pt if paragraph.space_before else None
160
- if hasattr(paragraph, 'space_after'):
161
- shape_data['space_after'] = paragraph.space_after.pt if paragraph.space_after else None
162
-
163
- if hasattr(paragraph, 'line_spacing') and paragraph.line_spacing:
164
- line_spacing = paragraph.line_spacing
165
-
166
- # Nếu line_spacing là một số lớn (ví dụ: 84.99 pt), có thể là EXACTLY
167
- if isinstance(line_spacing, Pt) or line_spacing > 10:
168
- line_spacing_rule = "EXACTLY"
169
- elif isinstance(line_spacing, float):
170
- line_spacing_rule = "MULTIPLE"
171
- else:
172
- line_spacing_rule = "UNKNOWN"
173
-
174
- shape_data['line_spacing_info'] = {
175
- 'rule': line_spacing_rule,
176
- 'value': line_spacing if isinstance(line_spacing, float) else None
177
- }
178
-
179
- return shape_data
180
-
181
- def apply_shape_properties(shape, shape_data):
182
- """Apply saved properties to a shape."""
183
- try:
184
- shape.width = shape_data['width']
185
- shape.height = shape_data['height']
186
- shape.left = shape_data['left']
187
- shape.top = shape_data['top']
188
- shape.text = ""
189
- paragraph = shape.text_frame.paragraphs[0]
190
- run = paragraph.add_run()
191
- run.text = shape_data['text']
192
- if shape_data['font_size']:
193
- adjusted_size = shape_data['font_size'] * 0.9
194
- run.font.size = Pt(adjusted_size)
195
-
196
- if shape_data.get('font_name'):
197
- run.font.name = shape_data['font_name']
198
- else:
199
- run.font.name = "Arial"
200
- if shape_data.get('font_color'):
201
- run.font.color.rgb = RGBColor.from_string(shape_data['font_color'])
202
- if shape_data['bold'] is not None:
203
- run.font.bold = shape_data['bold']
204
- if shape_data['italic'] is not None:
205
- run.font.italic = shape_data['italic']
206
- if shape_data['alignment']:
207
- paragraph.alignment = get_alignment_value(shape_data['alignment'])
208
-
209
- line_spacing_info = shape_data.get('line_spacing_info', {})
210
- line_spacing_rule = line_spacing_info.get('rule')
211
- line_spacing_value = line_spacing_info.get('value')
212
-
213
- if line_spacing_rule and line_spacing_value is not None:
214
- if line_spacing_rule == "EXACTLY":
215
- paragraph.line_spacing = Pt(line_spacing_value)
216
- elif line_spacing_rule == "AT_LEAST":
217
- paragraph.line_spacing = Pt(line_spacing_value)
218
- elif line_spacing_rule == "MULTIPLE":
219
- paragraph.line_spacing = line_spacing_value
220
- else:
221
- print(f"⚠️ Unknown line spacing rule: {line_spacing_rule}")
222
-
223
- if shape_data['space_before']:
224
- paragraph.space_before = shape_data['space_before']
225
- if shape_data['space_after']:
226
- paragraph.space_after = shape_data['space_after']
227
-
228
-
229
- except Exception as e:
230
- print(f"Error applying shape properties: {str(e)}")
231
-
232
-
233
- def apply_table_properties(table, table_data):
234
- """Áp dụng các thuộc tính đã lưu vào bảng PowerPoint."""
235
- for row_idx, row in enumerate(table.rows):
236
- for col_idx, cell in enumerate(row.cells):
237
- try:
238
- cell_data = table_data['cells'][row_idx][col_idx]
239
-
240
- # Áp dụng margin
241
- cell.margin_left = cell_data.get('margin_left', 0)
242
- cell.margin_right = cell_data.get('margin_right', 0)
243
- cell.margin_top = cell_data.get('margin_top', 0)
244
- cell.margin_bottom = cell_data.get('margin_bottom', 0)
245
-
246
- # Áp dụng vertical_anchor (tránh dùng eval)
247
- if 'vertical_anchor' in cell_data:
248
- cell.vertical_anchor = get_vertical_anchor(cell_data['vertical_anchor'])
249
-
250
- # Xóa nội dung cũ và thiết lập văn bản mới
251
- cell.text = ""
252
- paragraph = cell.text_frame.paragraphs[0]
253
- run = paragraph.add_run()
254
- run.text = cell_data.get('text', "")
255
-
256
- # Thiết lập kích thước font
257
- if 'font_size' in cell_data:
258
- adjusted_size = cell_data['font_size'] * 0.9 # Giữ tỉ lệ font
259
- run.font.size = Pt(adjusted_size)
260
-
261
- # Thiết lập font chữ
262
- run.font.name = cell_data.get('font_name', 'Arial')
263
-
264
- # Màu chữ
265
- if 'font_color' in cell_data:
266
- run.font.color.rgb = RGBColor.from_string(cell_data['font_color'])
267
-
268
- # In đậm & in nghiêng
269
- run.font.bold = cell_data.get('bold', False)
270
- run.font.italic = cell_data.get('italic', False)
271
-
272
- # Căn lề văn bản
273
- if 'alignment' in cell_data:
274
- paragraph.alignment = get_alignment_value(cell_data['alignment'])
275
-
276
- except Exception as e:
277
- print(f"Lỗi khi thiết lập thuộc tính ô [{row_idx}, {col_idx}]: {str(e)}")
278
-
279
-
280
- def get_file_from_mongodb(db_name, collection_name, file_id):
281
- """Tải tệp từ MongoDB GridFS"""
282
- client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
283
- db = client[db_name]
284
- fs = GridFS(db, collection_name)
285
- file_data = fs.get(file_id)
286
- return file_data
287
- # return BytesIO(file_data.read())
288
-
289
-
290
- def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
291
- """Lưu tệp vào MongoDB GridFS"""
292
- client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
293
- db = client[db_name]
294
- fs = GridFS(db, collection_name)
295
- file_id = fs.put(file_data, filename=file_name)
296
- client.close()
297
- return file_id
298
-
299
- def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
300
- """Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
301
- try:
302
- # Kết nối MongoDB và tải file
303
- original_ppt= get_file_from_mongodb(db_name, "root_file", original_ppt_id)
304
- translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
305
-
306
- # Load PowerPoint gốc và XML dịch
307
- prs = Presentation(BytesIO(original_ppt.read()))
308
- tree = ET.parse(BytesIO(translated_xml.read()))
309
- root = tree.getroot()
310
-
311
- # Áp dụng bản dịch
312
- for slide_number, slide in enumerate(prs.slides, 1):
313
- xml_slide = root.find(f".//slide[@number='{slide_number}']")
314
- if xml_slide is None:
315
- continue
316
- for shape_index, shape in enumerate(slide.shapes):
317
- if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
318
- apply_group_properties_recursive(shape, shape_index, xml_slide)
319
- elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
320
- table_element = xml_slide.find(f".//table_element[@shape_index='{shape_index}']")
321
- if table_element is not None:
322
- props_element = table_element.find("properties")
323
- if props_element is not None and props_element.text:
324
- try:
325
- table_data = json.loads(props_element.text)
326
- apply_table_properties(shape.table, table_data)
327
- except Exception as e:
328
- print(f"Error applying table properties: {str(e)}")
329
- elif hasattr(shape, "text"):
330
- text_element = xml_slide.find(f".//text_element[@shape_index='{shape_index}']")
331
- if text_element is not None:
332
- props_element = text_element.find("properties")
333
- if props_element is not None and props_element.text:
334
- try:
335
- shape_data = json.loads(props_element.text)
336
- apply_shape_properties(shape, shape_data)
337
- except Exception as e:
338
- print(f"Error applying shape properties: {str(e)}")
339
-
340
- # Lưu PowerPoint vào MongoDB với tên gốc
341
- output_io = BytesIO()
342
- prs.save(output_io)
343
- output_io.seek(0) # Reset vị trí đọc
344
-
345
- # Giữ nguyên tên file gốc, thêm hậu tố "_translated"
346
- translated_filename = original_ppt.filename.replace(".xml", ".pptx")
347
-
348
- file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
349
- print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
350
-
351
- return file_id
352
- except Exception as e:
353
- print(f"Error creating translated PowerPoint: {str(e)}")
354
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
powerpoint/pptx_processor.py DELETED
@@ -1,50 +0,0 @@
1
- # ppt_processor.py
2
- from pathlib import Path
3
- from xml_handling import ppt_to_xml, translate_xml_file
4
- from pptx_object import create_translated_ppt
5
- import os
6
-
7
- def process_ppt_file(ppt_path: Path, source_lang: str, target_lang: str):
8
- """Process a single PPT/PPTX file from XML extraction to final translation."""
9
- ppt_path = ppt_path.strip("'\"")
10
- ppt_path = ppt_path.replace("\\ ", " ")
11
- ppt_path = ppt_path.replace("\\'", "'")
12
- ppt_path = os.path.expanduser(ppt_path)
13
- ppt_path = Path(ppt_path).resolve()
14
- # chuyển thành link DB trên server
15
- try:
16
- if not ppt_path.is_file():
17
- print(f"Error: '{ppt_path}' is not a valid file.")
18
- return
19
- if ppt_path.suffix.lower() not in ['.ppt', '.pptx']:
20
- print(f"Error: '{ppt_path}' is not a PowerPoint file.")
21
- return
22
-
23
- base_dir = ppt_path.parent
24
-
25
- # Original XML
26
- print(f"Generating original XML for {ppt_path.name}...")
27
- original_xml = ppt_to_xml(str(ppt_path))
28
- if original_xml:
29
- original_output_path = base_dir / f"{ppt_path.stem}_original.xml"
30
- with open(original_output_path, 'w', encoding='utf-8') as f:
31
- f.write(original_xml)
32
- print(f"Original XML saved: {original_output_path}")
33
-
34
- # Save original XML to MongoDB
35
- # save_xml_to_mongodb(original_xml, ppt_path.stem + "_original.xml")
36
-
37
- # Translated XML
38
- print(f"Generating translated XML (from {source_lang} to {target_lang}) for {ppt_path.name}...")
39
- translated_output_path = base_dir / f"{ppt_path.stem}_translated.xml"
40
- original_xml_path = base_dir / f"{ppt_path.stem}_original.xml"
41
- translate_xml_file(str(original_xml_path), str(translated_output_path), source_lang, target_lang)
42
-
43
- # Create Translated PPT
44
- print(f"Creating translated PPT for {ppt_path.name}...")
45
- output_filename = f"{ppt_path.stem}_translated{ppt_path.suffix}"
46
- output_ppt_path = base_dir / output_filename
47
- create_translated_ppt(str(ppt_path), str(translated_output_path), str(output_ppt_path))
48
-
49
- except Exception as e:
50
- print(f"Error in process_ppt_file for {ppt_path}: {str(e)}")