mintlee committed on
Commit 0e9ff78 · 1 Parent(s): 95bd308

Add application file
.env ADDED
@@ -0,0 +1 @@
GEMINI_API_KEY = AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg
README.md CHANGED
@@ -1,13 +1,3 @@
- ---
- title: MT Deploy
- emoji: 🐠
- colorFrom: green
- colorTo: green
- sdk: streamlit
- sdk_version: 1.43.2
- app_file: app.py
- pinned: false
- short_description: deploy Machine Translation
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Machine-Translation
+
+ - Link drive: https://drive.google.com/drive/folders/19htOXYBz88eNIWU0-_3xEn1JRU-JaIvW?usp=drive_link
db/mongodb.py ADDED
@@ -0,0 +1,194 @@
from pymongo import MongoClient
import gridfs
from bson import ObjectId
import os
from io import BytesIO
import magic

def connect_mongodb(db_name, collection_name):
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)
    return fs


def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file", file_name=None, file_tail=".pptx"):
    """
    Save a PowerPoint (pptx) file to MongoDB via GridFS,
    but skip saving if a file with the same name already exists.

    :param uploaded_file: UploadedFile object from Streamlit
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    :param file_name: Name to store the file under (without .pptx). If None, the original name is used.
    :return: file_id if saved successfully, None if the file already exists
    """
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    # Determine the file name
    if not file_name:
        # Take the name from uploaded_file (e.g. "slide.pptx")
        file_name = uploaded_file.name
    else:
        # If only a bare name was given, append the extension when missing
        if not file_name.endswith(file_tail):
            file_name = file_name + file_tail

    # Check whether the file already exists in MongoDB
    existing_file = fs.find_one({"filename": file_name})
    if existing_file:
        print(f"⚠️ File '{file_name}' already exists in MongoDB. Not saving it again. Please choose another name.")
        client.close()
        return None

    # Make sure the file pointer is at the beginning
    uploaded_file.seek(0)
    file_bytes = uploaded_file.read()

    # Store the file contents (bytes) in MongoDB
    file_id = fs.put(file_bytes, filename=file_name)
    print(f"✅ File '{file_name}' was saved to '{collection_name}' with ID: {file_id}")
    client.close()
    return file_id

def delete_pptx_from_mongodb(file_id, db_name="ppt", collection_name="root_file"):
    """
    Delete a PowerPoint file from MongoDB by ID.

    :param file_id: ID of the file to delete (string or ObjectId)
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Convert the ID if necessary
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)

        # Check whether the file exists
        if fs.exists(file_id):
            fs.delete(file_id)
            print(f"✅ Deleted file with ID: {file_id}")
        else:
            print(f"⚠️ No file found with ID: {file_id}")
    except Exception as e:
        print(f"❌ Error deleting file: {e}")

    client.close()

def download_pptx_from_mongodb(file_id, save_path, save_name, db_name="ppt", collection_name="root_file"):
    """
    Download a PowerPoint file from MongoDB GridFS and save it locally.

    :param file_id: ID of the file to download (string or ObjectId)
    :param save_path: Directory to save the file to (e.g. 'D:/output')
    :param save_name: File name to save as (e.g. 'my_presentation.pptx')
    :param db_name: MongoDB database name (default: 'ppt')
    :param collection_name: GridFS collection name (default: 'root_file')
    """
    # Make sure the target directory exists
    os.makedirs(save_path, exist_ok=True)

    # Build the full file path
    full_file_path = os.path.join(save_path, save_name)

    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Convert the ID if necessary
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)

        # Fetch the file data from GridFS
        file_data = fs.get(file_id)

        # Write the data out to disk
        with open(full_file_path, "wb") as f:
            f.write(file_data.read())

        print(f"✅ File downloaded to: {full_file_path}")
    except Exception as e:
        print(f"❌ Error downloading file: {e}")
    finally:
        client.close()

def save_xml_to_gridfs(xml_content, file_name, db_name="ppt", collection_name="original_xml"):
    """
    Save XML to MongoDB GridFS.

    :param xml_content: XML string to save
    :param file_name: XML file name
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    """
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    # Check whether the file already exists
    existing_file = fs.find_one({"filename": file_name})
    if existing_file:
        print(f"⚠️ File '{file_name}' already exists in GridFS. Not saving it again.")
        return

    # Encode the XML string as bytes and store it in GridFS
    file_id = fs.put(xml_content.encode("utf-8"), filename=file_name)
    print(f"✅ XML '{file_name}' was saved to GridFS with ID: {file_id}")

def fetch_file_from_mongodb(db_name, collection_name, file_id):
    client = MongoClient("mongodb://localhost:27017/")  # Update if needed
    db = client[db_name]
    fs = gridfs.GridFS(db, collection_name)

    try:
        file_data = fs.get(file_id)
        pptx_io = BytesIO(file_data.read())
        pptx_io.seek(0)  # Reset to the start of the file
        return pptx_io, file_data.filename
    except Exception as e:
        print(f"Error fetching file from MongoDB: {e}")
        return None, None

def detect_file_type(uploaded_file):
    if uploaded_file is not None:
        try:
            file_bytes = uploaded_file.read(4096)  # Read more bytes for more reliable MIME detection
            mime = magic.Magic(mime=True)
            file_type = mime.from_buffer(file_bytes)
        except Exception as e:
            print(f"Error detecting file type: {e}")
            file_type = "Unknown"

        # Common MIME types
        mime_types = {
            "application/pdf": "PDF",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
            "application/vnd.ms-powerpoint": "PPTX",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel",
            "application/vnd.ms-excel": "Excel",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word",
            "application/msword": "Word",
            "text/csv": "CSV",
            "text/plain": "CSV"  # Some CSV files are detected as text/plain
        }

        detected_type = mime_types.get(file_type, "Unknown")

        # If still unsure, fall back to the file extension
        if detected_type == "Unknown":
            ext = os.path.splitext(uploaded_file.name)[1].lower()
            ext_mapping = {".csv": "CSV", ".docx": "Word", ".doc": "Word", ".xlsx": "Excel", ".pptx": "PPTX", ".pdf": "PDF"}
            detected_type = ext_mapping.get(ext, "Unknown")

        return detected_type
    return None
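A minimal sketch of how these helpers might be driven outside Streamlit (assumptions: a MongoDB instance on mongodb://localhost:27017 and a local `example.pptx`; a plain file handle stands in for the Streamlit UploadedFile, since both expose `.name`, `.seek()` and `.read()`):

```python
# Hypothetical driver for db/mongodb.py; not part of the app itself.
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, delete_pptx_from_mongodb

with open("example.pptx", "rb") as f:          # f.name == "example.pptx"
    file_id = save_file_to_mongodb(f, db_name="ppt", collection_name="root_file")

if file_id:
    # Read the stored file back as an in-memory BytesIO plus its stored filename.
    pptx_io, filename = fetch_file_from_mongodb("ppt", "root_file", file_id)
    print(filename, len(pptx_io.getvalue()), "bytes")

    # Remove the test document again.
    delete_pptx_from_mongodb(file_id, db_name="ppt", collection_name="root_file")
```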
excel/excel_translate.py ADDED
@@ -0,0 +1,174 @@
import xlwings as xw
from typing import Dict, List
from translate.translator import translate_text_dict
import math
import chardet
import io
import pandas as pd
import pymongo
import gridfs
from io import BytesIO
import tempfile
import os

def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en', target_lang: str = "fr", gemini_api: str = "", db_name: str = "excel"):
    # Connect to MongoDB
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs_input = gridfs.GridFS(db, collection="root_file")
    fs_output = gridfs.GridFS(db, collection="final_file")

    # Download the file from MongoDB
    file_data = fs_input.get(file_id).read()

    # Write it to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        temp_file.write(file_data)
        temp_file_path = temp_file.name

    # Start xlwings (run Excel hidden)
    app = xw.App(visible=False)
    wb = xw.Book(temp_file_path)  # Open the workbook from the temp file

    # Use the requested sheet, or all sheets
    sheets = [wb.sheets[sheet_name]] if sheet_name else wb.sheets

    for sheet in sheets:
        last_row = sheet.used_range.rows.count
        last_col = sheet.used_range.columns.count

        # Dictionaries holding the text to translate and the key-to-cell mapping
        text_dict: Dict[str, List[str]] = {}
        cell_map: Dict[str, any] = {}  # maps key -> cell object

        for row in range(1, last_row + 1):
            for col in range(1, last_col + 1):
                cell = sheet.cells[row, col]
                if isinstance(cell.value, str):
                    key = f"R{row}C{col}"  # key of the form R{row}C{col}
                    text_dict[key] = [cell.value]  # store the value as a single-element list
                    cell_map[key] = cell

        # Translate the whole dictionary in bulk
        translated_dict = translate_text_dict(text_dict, source_lang=from_lang, target_lang=target_lang, gemini_api=gemini_api)

        # Write the translated content back into the cells
        for key, cell in cell_map.items():
            if key in translated_dict:
                translated_text_list = translated_dict[key]
                if translated_text_list and len(translated_text_list) > 0:
                    cell.value = translated_text_list[0]

    # Save the workbook back to the temp file
    wb.save(temp_file_path)
    wb.close()
    app.quit()

    # Re-read the temp file and store it in MongoDB
    with open(temp_file_path, "rb") as f:
        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.xlsx")

    # Remove the temp file
    os.remove(temp_file_path)

    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
    return translated_file_id


def read_csv_with_auto_encoding(csv_path):
    # Read the file as raw bytes
    with open(csv_path, "rb") as f:
        raw_data = f.read()
    # Detect the encoding
    detect_result = chardet.detect(raw_data)
    encoding = detect_result["encoding"]
    confidence = detect_result["confidence"]

    print(f"Chardet guessed that '{csv_path}' has encoding = {encoding} (confidence = {confidence})")

    # If chardet could not detect anything, fall back to 'utf-8'
    if encoding is None:
        encoding = "utf-8"

    decoded_data = raw_data.decode(encoding, errors='replace')

    # Use io.StringIO to turn the string into a file-like object
    csv_data = io.StringIO(decoded_data)
    df = pd.read_csv(csv_data)
    return df


def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
    # Connect to MongoDB
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs_input = gridfs.GridFS(db, collection="root_file")
    fs_output = gridfs.GridFS(db, collection="final_file")

    # Download the file from MongoDB
    file_data = fs_input.get(file_id).read()

    # Write it to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
        temp_file.write(file_data)
        temp_file_path = temp_file.name

    df = read_csv_with_auto_encoding(temp_file_path)

    # If text_columns is not specified, we assume we want to translate everything that looks like text.
    # Otherwise, only translate the given columns.
    if text_columns is None:
        # Example heuristic: choose all object/string columns
        text_columns = df.select_dtypes(include=["object"]).columns.tolist()

    num_rows = len(df)
    num_chunks = math.ceil(num_rows / chunk_size)

    translated_df = df.copy()  # copy to store the final translations

    for chunk_index in range(num_chunks):
        start_idx = chunk_index * chunk_size
        end_idx = min((chunk_index + 1) * chunk_size, num_rows)
        chunk_df = df.iloc[start_idx:end_idx]

        # Build a dictionary structure. For example, row-based:
        # {
        #   "0": {"colA": "some text", "colB": "some text"},
        #   "1": {"colA": "some text", "colB": "some text"},
        #   ...
        # }
        chunk_dict = {}
        for i, row in chunk_df.iterrows():
            row_dict = {}
            for col in text_columns:
                row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
            chunk_dict[str(i)] = row_dict

        # Now call your LLM translator on this dictionary
        translated_chunk = translate_text_dict(
            text_dict=chunk_dict,
            source_lang=source_lang,
            target_lang=target_lang,
            gemini_api=gemini_api
        )

        # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
        for i_str, row_data in translated_chunk.items():
            i = int(i_str)
            for col, translated_val in row_data.items():
                translated_df.at[i, col] = translated_val

    # Save the translated data to a temporary file
    translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
    translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')

    # Re-read the temp file and store it in MongoDB
    with open(translated_file_path, "rb") as f:
        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")

    # Remove the temp files
    os.remove(temp_file_path)
    os.remove(translated_file_path)

    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
    return translated_file_id
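A sketch of how `translate_csv` might be driven end to end, assuming a local MongoDB with the default `csv` database, a `GEMINI_API_KEY` in the environment, and an illustrative `sample.csv`:

```python
# Hypothetical driver for excel/excel_translate.py; file names and DB layout follow the defaults above.
import os
import gridfs
import pymongo
from excel.excel_translate import translate_csv

client = pymongo.MongoClient("mongodb://localhost:27017")
fs_in = gridfs.GridFS(client["csv"], collection="root_file")

# Upload a source CSV into the input bucket, then translate it.
with open("sample.csv", "rb") as f:
    src_id = fs_in.put(f, filename="sample.csv")

out_id = translate_csv(
    file_id=src_id,
    source_lang="en",
    target_lang="vi",
    gemini_api=os.getenv("GEMINI_API_KEY", ""),
)

# The translated CSV is stored in csv/final_file under out_id.
fs_out = gridfs.GridFS(client["csv"], collection="final_file")
print(fs_out.get(out_id).filename)
```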
home.py ADDED
@@ -0,0 +1,22 @@
import streamlit as st
import pandas as pd
import numpy as np

st.title("Some Streamlit Demo, maybe")

st.sidebar.header("Input")
num_rows = st.sidebar.slider("Number of rows", min_value=10, max_value=100, value=20)
num_cols = st.sidebar.slider("Number of columns", min_value=2, max_value=10, value=3)

data = np.random.randn(num_rows, num_cols)
columns = [f"Column {i+1}" for i in range(num_cols)]
df = pd.DataFrame(data, columns=columns)

st.subheader("Generated Data Table")
st.dataframe(df)

st.subheader("Line Chart of the Data")
st.line_chart(df)

st.subheader("Statistics")
st.write(df.describe())
pages/upload.py ADDED
@@ -0,0 +1,134 @@
import streamlit as st
import google.generativeai as genai
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
from powerpoint.xml_handling import (
    extract_text_from_xml, update_xml_with_translated_text_mongodb, ppt_to_xml_mongodb
)
from translate.translator import translate_text_dict
from powerpoint.pptx_object import create_translated_ppt
from excel.excel_translate import translate_xlsx, translate_csv
from word.word_translate import translate_docx

import dotenv
import os

dotenv.load_dotenv(".env")

# Configure the API key
api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

# Streamlit UI
st.title("Upload PPTX to MongoDB")

uploaded_file = st.file_uploader("Choose a file (PPTX, XLSX, CSV, DOCX)", type=["pptx", "xlsx", "csv", "docx"])
file_name_input = st.text_input("File name to save as (no need to include .pptx)", value="")

final_pptx_id = None  # Holds the file ID after processing

if uploaded_file is not None:
    if st.button("Upload"):
        file_type = detect_file_type(uploaded_file)
        st.write(f"Detected file type: {file_type}")
        if file_type == "PPTX":

            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, file_name=file_name_input)
            st.write(f"File ID: {file_id}")

            xml_file_id = ppt_to_xml_mongodb(file_id)
            text_dict = extract_text_from_xml(file_id=xml_file_id)
            translated_dict = translate_text_dict(text_dict, source_lang="Vietnamese", target_lang="English", gemini_api=api_key)

            final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
            st.write(f"Final XML ID: {final_xml_id}")

            # Keep the ID of the final PPTX file
            final_pptx_id = create_translated_ppt(
                db_name="ppt", original_ppt_id=file_id,
                translated_xml_id=final_xml_id, output_collection="final_pptx"
            )
            st.write(f"Final PPTX ID: {final_pptx_id}")

            # Confirm the result before offering the download
            if final_pptx_id:
                st.write("✅ The file is ready to download!")

                pptx_io, pptx_filename = fetch_file_from_mongodb("ppt", "final_pptx", final_pptx_id)

                if pptx_io:
                    # Download button for the translated file
                    st.download_button(
                        label="Click to Download",
                        data=pptx_io.getvalue(),  # convert to bytes for download
                        file_name=pptx_filename,
                        mime="application/vnd.openxmlformats-officedocument.presentationml.presentation"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")



        elif file_type == "Excel":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="excel", collection_name="root_file", file_name=file_name_input, file_tail=".xlsx")
            st.write(f"File ID: {file_id}")

            final_id = translate_xlsx(file_id=file_id, from_lang="en", target_lang="vi", gemini_api=api_key)
            st.write(f"Final Excel ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                excel_io, excel_filename = fetch_file_from_mongodb("excel", "final_file", final_id)

                if excel_io:
                    st.download_button(
                        label="Click to Download",
                        data=excel_io.getvalue(),
                        file_name=excel_filename,
                        mime="application/vnd.ms-excel"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")
        elif file_type == "CSV":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="csv", collection_name="root_file", file_name=file_name_input, file_tail=".csv")
            st.write(f"File ID: {file_id}")

            final_id = translate_csv(file_id=file_id, source_lang="en", target_lang="vi", gemini_api=api_key)
            st.write(f"Final CSV ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                csv_io, csv_filename = fetch_file_from_mongodb("csv", "final_file", final_id)

                if csv_io:
                    st.download_button(
                        label="Click to Download",
                        data=csv_io.getvalue(),
                        file_name=csv_filename,
                        mime="text/csv"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")

        elif file_type == "Word":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file", file_name=file_name_input, file_tail=".docx")
            st.write(f"File ID: {file_id}")

            final_id = translate_docx(file_id=file_id, source_lang="en", target_lang="vi")
            st.write(f"Final Word ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                docx_io, docx_filename = fetch_file_from_mongodb("word", "final_file", final_id)

                if docx_io:
                    st.download_button(
                        label="Click to Download",
                        data=docx_io.getvalue(),
                        file_name=docx_filename,
                        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")
powerpoint/__init__.py ADDED
File without changes
powerpoint/pptx_object.py ADDED
@@ -0,0 +1,357 @@
# ppt_objects.py
from pptx import Presentation
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
from pptx.enum.shapes import MSO_SHAPE_TYPE
import xml.etree.ElementTree as ET
from pptx.util import Pt
from pptx.dml.color import RGBColor
import re
import json

from pymongo import MongoClient
from gridfs import GridFS
from io import BytesIO


def apply_group_properties_recursive(shape, shape_index, parent_element):
    """Recursively applies properties to shapes within groups."""
    if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        group_element = parent_element.find(f".//group_element[@shape_index='{shape_index}']")
        if group_element is not None:
            for i, sub_shape in enumerate(shape.shapes):
                apply_group_properties_recursive(sub_shape, i, group_element)

                # Apply properties for sub-shapes WITHIN the group, based on their type.
                if sub_shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = group_element.find(f".//table_element[@shape_index='{i}']")
                    if table_element is not None:
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                table_data = json.loads(props_element.text)
                                apply_table_properties(sub_shape.table, table_data)
                            except (json.JSONDecodeError, KeyError) as e:
                                print(f"Error applying table properties (in group): {str(e)}")

                elif hasattr(sub_shape, "text_frame") and sub_shape.text_frame:
                    text_element = group_element.find(f".//text_element[@shape_index='{i}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(sub_shape, shape_data)
                            except (json.JSONDecodeError, KeyError) as e:
                                print(f"Error applying shape properties (in group): {str(e)}")

def get_alignment_value(alignment_str):
    """Convert alignment string (with extra characters) to PP_ALIGN enum value."""
    alignment_map = {
        'center': PP_ALIGN.CENTER,
        'left': PP_ALIGN.LEFT,
        'right': PP_ALIGN.RIGHT,
        'justify': PP_ALIGN.JUSTIFY
    }
    match = re.match(r"([A-Za-z]+)", alignment_str)
    return alignment_map.get(match.group(1).lower()) if match else None

def get_vertical_anchor(value):
    """Converts vertical_anchor string to MSO_ANCHOR enum."""
    mapping = {
        "TOP": MSO_ANCHOR.TOP,
        "MIDDLE": MSO_ANCHOR.MIDDLE,
        "BOTTOM": MSO_ANCHOR.BOTTOM
    }
    return mapping.get(value.upper().split()[0], MSO_ANCHOR.TOP)

def get_table_properties(table):
    """Extract complete table properties."""
    table_data = {
        'rows': len(table.rows),
        'cols': len(table.columns),
        'cells': []
    }
    for row in table.rows:
        row_data = []
        for cell in row.cells:
            cell_data = {
                'text': cell.text.strip(),
                'font_size': None,
                'font_name': None,
                'alignment': None,
                'margin_left': cell.margin_left,
                'margin_right': cell.margin_right,
                'margin_top': cell.margin_top,
                'margin_bottom': cell.margin_bottom,
                'vertical_anchor': str(cell.vertical_anchor) if cell.vertical_anchor else None,
                'font_color': None
            }
            if cell.text_frame.paragraphs:
                paragraph = cell.text_frame.paragraphs[0]
                if paragraph.runs:
                    run = paragraph.runs[0]
                    if hasattr(run.font, 'size') and run.font.size is not None:
                        cell_data['font_size'] = run.font.size.pt
                    if hasattr(run.font, 'name'):
                        cell_data['font_name'] = run.font.name
                    if hasattr(run.font, 'bold'):
                        cell_data['bold'] = run.font.bold
                    if hasattr(run.font, 'italic'):
                        cell_data['italic'] = run.font.italic
                    if (hasattr(run.font, 'color') and
                            run.font.color is not None and
                            hasattr(run.font.color, 'rgb') and
                            run.font.color.rgb is not None):
                        cell_data['font_color'] = str(run.font.color.rgb)
                if hasattr(paragraph, 'alignment'):
                    cell_data['alignment'] = f"{paragraph.alignment}" if paragraph.alignment else None
            row_data.append(cell_data)
        table_data['cells'].append(row_data)
    return table_data

def get_shape_properties(shape):
    """Extract all properties from a shape, with detailed debug prints."""
    shape_data = {
        'text': '',
        'font_size': None,
        'font_name': None,
        'alignment': None,
        'width': shape.width,
        'height': shape.height,
        'left': shape.left,
        'top': shape.top,
        'bold': None,
        'italic': None,
        'line_spacing_info': {
            'rule': None,
            'value': None
        },
        'space_before': None,
        'space_after': None,
        'font_color': None
    }

    if hasattr(shape, "text"):
        shape_data['text'] = shape.text.strip()
    if hasattr(shape, 'text_frame'):
        for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs):
            if paragraph.runs:
                run = paragraph.runs[0]  # Assuming properties are mostly consistent in the first run
                if hasattr(run.font, 'size') and run.font.size is not None:
                    shape_data['font_size'] = run.font.size.pt
                if hasattr(run.font, 'name'):
                    shape_data['font_name'] = run.font.name
                if hasattr(run.font, 'bold'):
                    shape_data['bold'] = run.font.bold
                if hasattr(run.font, 'italic'):
                    shape_data['italic'] = run.font.italic
                if (hasattr(run.font, 'color') and
                        run.font.color is not None and
                        hasattr(run.font.color, 'rgb') and
                        run.font.color.rgb is not None):
                    shape_data['font_color'] = str(run.font.color.rgb)

            if hasattr(paragraph, 'alignment') and paragraph.alignment is not None:
                shape_data['alignment'] = str(paragraph.alignment).split('.')[-1]
            if hasattr(paragraph, 'space_before'):
                shape_data['space_before'] = paragraph.space_before.pt if paragraph.space_before else None
            if hasattr(paragraph, 'space_after'):
                shape_data['space_after'] = paragraph.space_after.pt if paragraph.space_after else None

            if hasattr(paragraph, 'line_spacing') and paragraph.line_spacing:
                line_spacing = paragraph.line_spacing

                # If line_spacing is a large number (e.g. 84.99 pt), it is probably EXACTLY
                if isinstance(line_spacing, Pt) or line_spacing > 10:
                    line_spacing_rule = "EXACTLY"
                elif isinstance(line_spacing, float):
                    line_spacing_rule = "MULTIPLE"
                else:
                    line_spacing_rule = "UNKNOWN"

                shape_data['line_spacing_info'] = {
                    'rule': line_spacing_rule,
                    'value': line_spacing if isinstance(line_spacing, float) else None
                }

    return shape_data

def apply_shape_properties(shape, shape_data):
    """Apply saved properties to a shape."""
    try:
        shape.width = shape_data['width']
        shape.height = shape_data['height']
        shape.left = shape_data['left']
        shape.top = shape_data['top']
        shape.text = ""
        paragraph = shape.text_frame.paragraphs[0]
        run = paragraph.add_run()
        run.text = shape_data['text']
        if shape_data['font_size']:
            adjusted_size = shape_data['font_size'] * 0.9
            run.font.size = Pt(adjusted_size)

        if shape_data.get('font_name'):
            run.font.name = shape_data['font_name']
        else:
            run.font.name = "Arial"
        if shape_data.get('font_color'):
            run.font.color.rgb = RGBColor.from_string(shape_data['font_color'])
        if shape_data['bold'] is not None:
            run.font.bold = shape_data['bold']
        if shape_data['italic'] is not None:
            run.font.italic = shape_data['italic']
        if shape_data['alignment']:
            paragraph.alignment = get_alignment_value(shape_data['alignment'])

        line_spacing_info = shape_data.get('line_spacing_info', {})
        line_spacing_rule = line_spacing_info.get('rule')
        line_spacing_value = line_spacing_info.get('value')

        if line_spacing_rule and line_spacing_value is not None:
            if line_spacing_rule == "EXACTLY":
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "AT_LEAST":
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "MULTIPLE":
                paragraph.line_spacing = line_spacing_value
            else:
                print(f"⚠️ Unknown line spacing rule: {line_spacing_rule}")

        if shape_data['space_before']:
            paragraph.space_before = shape_data['space_before']
        if shape_data['space_after']:
            paragraph.space_after = shape_data['space_after']


    except Exception as e:
        print(f"Error applying shape properties: {str(e)}")


def apply_table_properties(table, table_data):
    """Apply saved properties to a PowerPoint table."""
    for row_idx, row in enumerate(table.rows):
        for col_idx, cell in enumerate(row.cells):
            try:
                cell_data = table_data['cells'][row_idx][col_idx]

                # Apply margins
                cell.margin_left = cell_data.get('margin_left', 0)
                cell.margin_right = cell_data.get('margin_right', 0)
                cell.margin_top = cell_data.get('margin_top', 0)
                cell.margin_bottom = cell_data.get('margin_bottom', 0)

                # Apply vertical_anchor (avoid using eval)
                if 'vertical_anchor' in cell_data:
                    cell.vertical_anchor = get_vertical_anchor(cell_data['vertical_anchor'])

                # Clear the old content and set the new text
                cell.text = ""
                paragraph = cell.text_frame.paragraphs[0]
                run = paragraph.add_run()
                run.text = cell_data.get('text', "")

                # Set the font size
                if 'font_size' in cell_data:
                    adjusted_size = cell_data['font_size'] * 0.9  # Keep the font proportion
                    run.font.size = Pt(adjusted_size)

                # Set the font family
                run.font.name = cell_data.get('font_name', 'Arial')

                # Font color
                if 'font_color' in cell_data:
                    run.font.color.rgb = RGBColor.from_string(cell_data['font_color'])

                # Bold and italic
                run.font.bold = cell_data.get('bold', False)
                run.font.italic = cell_data.get('italic', False)

                # Text alignment
                if 'alignment' in cell_data:
                    paragraph.alignment = get_alignment_value(cell_data['alignment'])

            except Exception as e:
                print(f"Error setting properties of cell [{row_idx}, {col_idx}]: {str(e)}")


def get_file_from_mongodb(db_name, collection_name, file_id):
    """Download a file from MongoDB GridFS."""
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = GridFS(db, collection_name)
    file_data = fs.get(file_id)
    return BytesIO(file_data.read())


def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
    """Save a file to MongoDB GridFS."""
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = GridFS(db, collection_name)
    file_id = fs.put(file_data, filename=file_name)
    return file_id

def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
    """Build the translated PowerPoint from MongoDB data and save it back to MongoDB."""
    try:
        # Connect to MongoDB and download the files
        original_ppt_io = get_file_from_mongodb(db_name, "root_file", original_ppt_id)
        translated_xml_io = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)

        # Load the original PowerPoint and the translated XML
        prs = Presentation(original_ppt_io)
        tree = ET.parse(translated_xml_io)
        root = tree.getroot()

        # Apply the translations
        for slide_number, slide in enumerate(prs.slides, 1):
            xml_slide = root.find(f".//slide[@number='{slide_number}']")
            if xml_slide is None:
                continue
            for shape_index, shape in enumerate(slide.shapes):
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    apply_group_properties_recursive(shape, shape_index, xml_slide)
                elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = xml_slide.find(f".//table_element[@shape_index='{shape_index}']")
                    if table_element is not None:
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                table_data = json.loads(props_element.text)
                                apply_table_properties(shape.table, table_data)
                            except Exception as e:
                                print(f"Error applying table properties: {str(e)}")
                elif hasattr(shape, "text"):
                    text_element = xml_slide.find(f".//text_element[@shape_index='{shape_index}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(shape, shape_data)
                            except Exception as e:
                                print(f"Error applying shape properties: {str(e)}")

        # Save the PowerPoint to MongoDB
        output_io = BytesIO()
        prs.save(output_io)
        output_io.seek(0)  # Reset the read position

        file_id = save_file_to_mongodb(db_name, output_collection, "translated_presentation.pptx", output_io)
        print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")

        return file_id
    except Exception as e:
        print(f"Error creating translated PowerPoint: {str(e)}")
        return None
powerpoint/pptx_processor.py ADDED
@@ -0,0 +1,50 @@
# ppt_processor.py
from pathlib import Path
from xml_handling import ppt_to_xml, translate_xml_file
from pptx_object import create_translated_ppt
import os

def process_ppt_file(ppt_path: Path, source_lang: str, target_lang: str):
    """Process a single PPT/PPTX file from XML extraction to final translation."""
    ppt_path = ppt_path.strip("'\"")
    ppt_path = ppt_path.replace("\\ ", " ")
    ppt_path = ppt_path.replace("\\'", "'")
    ppt_path = os.path.expanduser(ppt_path)
    ppt_path = Path(ppt_path).resolve()
    # convert this into a DB link on the server
    try:
        if not ppt_path.is_file():
            print(f"Error: '{ppt_path}' is not a valid file.")
            return
        if ppt_path.suffix.lower() not in ['.ppt', '.pptx']:
            print(f"Error: '{ppt_path}' is not a PowerPoint file.")
            return

        base_dir = ppt_path.parent

        # Original XML
        print(f"Generating original XML for {ppt_path.name}...")
        original_xml = ppt_to_xml(str(ppt_path))
        if original_xml:
            original_output_path = base_dir / f"{ppt_path.stem}_original.xml"
            with open(original_output_path, 'w', encoding='utf-8') as f:
                f.write(original_xml)
            print(f"Original XML saved: {original_output_path}")

            # Save original XML to MongoDB
            # save_xml_to_mongodb(original_xml, ppt_path.stem + "_original.xml")

        # Translated XML
        print(f"Generating translated XML (from {source_lang} to {target_lang}) for {ppt_path.name}...")
        translated_output_path = base_dir / f"{ppt_path.stem}_translated.xml"
        original_xml_path = base_dir / f"{ppt_path.stem}_original.xml"
        translate_xml_file(str(original_xml_path), str(translated_output_path), source_lang, target_lang)

        # Create Translated PPT
        print(f"Creating translated PPT for {ppt_path.name}...")
        output_filename = f"{ppt_path.stem}_translated{ppt_path.suffix}"
        output_ppt_path = base_dir / output_filename
        create_translated_ppt(str(ppt_path), str(translated_output_path), str(output_ppt_path))

    except Exception as e:
        print(f"Error in process_ppt_file for {ppt_path}: {str(e)}")
powerpoint/xml_handling.py ADDED
@@ -0,0 +1,368 @@
import xml.etree.ElementTree as ET
from xml.dom import minidom
import json
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from powerpoint.pptx_object import get_table_properties, get_shape_properties
from pymongo import MongoClient
import gridfs
from bson import ObjectId
from io import BytesIO


gemini_api = "AIzaSyDtBIjTSfbvuEsobNwjtdyi9gVpDrCaWPM"

def extract_text_from_group(group_shape, slide_number, shape_index, slide_element):
    """Extracts text from shapes within a group, only adding the group if it contains text."""
    group_element = ET.SubElement(slide_element, "group_element")
    group_element.set("shape_index", str(shape_index))
    group_element.set("group_name", group_shape.name)  # Add group name

    group_has_text = False  # Flag to track if the group contains any text

    for i, shape in enumerate(group_shape.shapes):
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            # Recursively check nested groups, and update group_has_text
            if extract_text_from_group(shape, slide_number, i, group_element):
                group_has_text = True
        elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
            table_element = ET.SubElement(group_element, "table_element")
            table_element.set("shape_index", str(i))
            table_data = get_table_properties(shape.table)
            props_element = ET.SubElement(table_element, "properties")
            props_element.text = json.dumps(table_data, indent=2)
            group_has_text = True
        elif hasattr(shape, "text_frame") and shape.text_frame:
            text_element = ET.SubElement(group_element, "text_element")
            text_element.set("shape_index", str(i))
            shape_data = get_shape_properties(shape)
            props_element = ET.SubElement(text_element, "properties")
            props_element.text = json.dumps(shape_data, indent=2)
            if shape_data.get("text") or (
                "paragraphs" in shape_data
                and any(p.get("text") for p in shape_data["paragraphs"])
            ):
                group_has_text = True

    # Only keep the group element if it contains text
    if not group_has_text:
        slide_element.remove(group_element)
        return False
    return True

def extract_text_from_slide(slide, slide_number, translate=False):
    """Extract all text elements from a slide."""
    slide_element = ET.Element("slide")
    slide_element.set("number", str(slide_number))

    for shape_index, shape in enumerate(slide.shapes):
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            extract_text_from_group(shape, slide_number, shape_index, slide_element)
        elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
            table_element = ET.SubElement(slide_element, "table_element")
            table_element.set("shape_index", str(shape_index))
            table_data = get_table_properties(shape.table)
            props_element = ET.SubElement(table_element, "properties")
            props_element.text = json.dumps(table_data, indent=2)
        elif hasattr(shape, "text"):
            text_element = ET.SubElement(slide_element, "text_element")
            text_element.set("shape_index", str(shape_index))
            shape_data = get_shape_properties(shape)
            props_element = ET.SubElement(text_element, "properties")
            props_element.text = json.dumps(shape_data, indent=2)
    return slide_element

def ppt_to_xml_mongodb(ppt_file_id: str, db_name="ppt"):
    """
    Convert a PowerPoint stored in MongoDB to XML and save the XML back to MongoDB.

    :param ppt_file_id: ID of the original PPT file in MongoDB (original_pptx)
    :param db_name: MongoDB database name
    :return: ID of the XML file in MongoDB (original_xml)
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]

    fs_ppt = gridfs.GridFS(db, collection="root_file")     # original PPT
    fs_xml = gridfs.GridFS(db, collection="original_xml")  # stored XML

    try:
        # Fetch the PPT file from MongoDB
        if not isinstance(ppt_file_id, ObjectId):
            ppt_file_id = ObjectId(ppt_file_id)
        ppt_file = fs_ppt.get(ppt_file_id)
        prs = Presentation(BytesIO(ppt_file.read()))

        # Build the XML
        root = ET.Element("presentation")
        root.set("file_name", ppt_file.filename)

        with ThreadPoolExecutor(max_workers=4) as executor:
            future_to_slide = {
                executor.submit(extract_text_from_slide, slide, slide_number): slide_number
                for slide_number, slide in enumerate(prs.slides, 1)
            }
            for future in future_to_slide:
                slide_number = future_to_slide[future]
                try:
                    slide_element = future.result()
                    root.append(slide_element)
                except Exception as e:
                    print(f"Error processing slide {slide_number}: {str(e)}")

        xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")

        # Save the XML to MongoDB
        xml_output = BytesIO(xml_str.encode("utf-8"))
        xml_file_id = fs_xml.put(xml_output, filename=f"{ppt_file.filename}.xml")

        print(f"✅ XML was saved to MongoDB (original_xml) with file_id: {xml_file_id}")

        return xml_file_id

    except Exception as e:
        print(f"❌ Error converting PPT to XML: {str(e)}")
        return None
    finally:
        client.close()




def extract_text_from_xml(file_id=None, filename=None, db_name="ppt", collection_name="original_xml") -> Dict[str, List[str]]:
    """
    Load XML from MongoDB and extract the text from each slide.

    :param file_id: ID of the file in MongoDB (ObjectId or string)
    :param filename: Name of the file to look up in MongoDB (e.g. "file.xml")
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    :return: Dictionary {slide_number: [text1, text2, ...]}
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Look up the file by file_id or filename
        if file_id:
            if not isinstance(file_id, ObjectId):
                file_id = ObjectId(file_id)
            file_data = fs.get(file_id)
        elif filename:
            file_data = fs.find_one({"filename": filename})
            if not file_data:
                print(f"❌ File '{filename}' not found in MongoDB!")
                return {}
        else:
            print("❌ Either 'file_id' or 'filename' must be provided.")
            return {}

        # Read the XML content from MongoDB
        xml_content = file_data.read().decode("utf-8")
        # print(f"✅ xml_content: {xml_content}")
        # Parse it into an XML tree
        root = ET.fromstring(xml_content)
        slide_texts = {}

        # Walk through each slide
        for slide in root.findall("slide"):
            slide_number = slide.get("number")
            texts = []

            # Helper function to extract text recursively
            def extract_text_recursive(element):
                if element.tag == "text_element":
                    props = element.find("properties")
                    if props is not None and props.text:
                        try:
                            shape_data = json.loads(props.text)
                            # Handle both direct 'text' and paragraph-based text
                            if 'text' in shape_data:
                                texts.append(shape_data['text'])
                            elif 'paragraphs' in shape_data:
                                for paragraph in shape_data['paragraphs']:
                                    if 'text' in paragraph:
                                        texts.append(paragraph['text'])
                                    # Also extract run-level text
                                    elif 'runs' in paragraph:
                                        for run in paragraph['runs']:
                                            if 'text' in run:
                                                texts.append(run['text'])

                        except json.JSONDecodeError:
                            pass  # Ignore if JSON is invalid

                elif element.tag == "table_element":
                    props = element.find("properties")
                    if props is not None and props.text:
                        try:
                            table_data = json.loads(props.text)
                            for row in table_data.get("cells", []):
                                for cell in row:
                                    texts.append(cell.get("text", ""))
                        except json.JSONDecodeError:
                            pass  # Ignore if JSON is invalid

                # Recursively process children of group_element
                elif element.tag == "group_element":
                    for child in element:
                        extract_text_recursive(child)

            # Iterate through all direct children of the slide
            for child in slide:
                extract_text_recursive(child)

            slide_texts[str(slide_number)] = texts  # Ensure slide number is a string
        print(slide_texts)
        return slide_texts

    except Exception as e:
        print(f"❌ Error processing the XML: {e}")
        return {}
    finally:
        client.close()




def adjust_size(original_text, translated_text, data_container):
    """Adjust font size if translated text is significantly longer."""

    if not original_text or not translated_text:
        return

    original_len = len(original_text)
    translated_len = len(translated_text)
    length_ratio = translated_len / original_len if original_len > 0 else 1  # Avoid division by 0

    if length_ratio > 1.5:  # Adjust threshold as needed
        if 'paragraphs' in data_container:
            for paragraph in data_container['paragraphs']:
                if 'runs' in paragraph:
                    for run in paragraph['runs']:
                        if run.get('font') and run['font'].get('size'):
                            run['font']['size'] = max(6, int(run['font']['size'] * 0.8))

        elif 'font' in data_container and data_container['font'].get('size'):
            data_container['font']['size'] = max(6, int(data_container['font']['size'] * 0.8))




def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[str, List[str]], db_name="ppt"):
    """
    Load XML from MongoDB (collection original_xml), apply the translated content, and save the result to the final_xml collection.

    :param file_id: ID of the file in MongoDB (original_xml)
    :param translated_dict: Dictionary {slide_number: [translated_text1, translated_text2, ...]}
    :param db_name: MongoDB database name
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]

    fs_original = gridfs.GridFS(db, collection="original_xml")  # read from original_xml
    fs_final = gridfs.GridFS(db, collection="final_xml")        # write to final_xml

    try:
        # Load the file from MongoDB (original_xml)
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)
        file_data = fs_original.get(file_id)
        xml_content = file_data.read().decode("utf-8")

        # Parse the XML string into a tree
        root = ET.fromstring(xml_content)

        # Apply the translated content
        for slide in root.findall("slide"):
            slide_num = slide.get("number")
            if slide_num in translated_dict:
                translated_texts = translated_dict[slide_num]
                text_index = 0  # Keep track of the current translated text

                def update_element_recursive(element):
                    nonlocal text_index  # Access and modify the outer scope's index

                    if element.tag == "text_element":
                        props = element.find("properties")
                        if props is not None and props.text:
                            try:
                                shape_data = json.loads(props.text)
                                original_text = ""

                                # Handle direct text and paragraph-based text
                                if 'text' in shape_data:
                                    original_text = shape_data['text']
                                    if text_index < len(translated_texts):
                                        shape_data['text'] = translated_texts[text_index]
                                        adjust_size(original_text, translated_texts[text_index], shape_data)
                                        text_index += 1
                                elif 'paragraphs' in shape_data:
                                    for paragraph in shape_data['paragraphs']:
                                        if 'text' in paragraph:
                                            original_text = paragraph['text']
                                            if text_index < len(translated_texts):
                                                paragraph['text'] = translated_texts[text_index]
                                                adjust_size(original_text, translated_texts[text_index], paragraph)
                                                text_index += 1
                                        elif 'runs' in paragraph:
                                            for run in paragraph['runs']:
                                                if 'text' in run:
                                                    original_text = run['text']
                                                    if text_index < len(translated_texts):
                                                        run['text'] = translated_texts[text_index]
                                                        adjust_size(original_text, translated_texts[text_index], run)
                                                        text_index += 1
                                props.text = json.dumps(shape_data, indent=2)
                            except json.JSONDecodeError:
                                print(f"JSONDecodeError in text_element on slide {slide_num}")

                    elif element.tag == "table_element":
                        props = element.find("properties")
                        if props is not None and props.text:
                            try:
                                table_data = json.loads(props.text)
                                for row in table_data.get("cells", []):
                                    for cell in row:
                                        original_text = cell.get('text', '')
                                        if text_index < len(translated_texts):
                                            cell['text'] = translated_texts[text_index]
                                            adjust_size(original_text, translated_texts[text_index], cell)
                                            text_index += 1
                                props.text = json.dumps(table_data, indent=2)
                            except json.JSONDecodeError:
                                print(f"JSONDecodeError in table_element on slide {slide_num}")

                    elif element.tag == "group_element":
                        print("Group element found")
                        for child in element:
                            update_element_recursive(child)  # Recursively process children

                # Start the recursive update from the slide's direct children
                for child in slide:
                    update_element_recursive(child)

        # Convert the XML back into a pretty-printed string
        updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")

        # Save the updated file to MongoDB (final_xml)
        new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}_translated.xml")
        print(f"✅ The updated XML was saved to MongoDB (final_xml) with file_id: {new_file_id}")

        return new_file_id

    except Exception as e:
        print(f"❌ Error updating the XML: {e}")
        return None
    finally:
        client.close()
test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
translate/translator.py ADDED
@@ -0,0 +1,64 @@
import json
from typing import Dict, List
from google import genai

def translate_text_dict(text_dict: Dict[str, List[str]], source_lang: str, target_lang: str = "vi", gemini_api: str = "") -> Dict[str, List[str]]:
    def translate_batch(batch_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """Translates a single batch of text."""
        prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}
The text is in {source_lang}, with a chance of there being phrases in other languages as well.

Read through the entire dictionary, then translate the texts into {target_lang} so that the meaning is as close to the intended context as possible.

Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.

Aim for brevity if possible so that the length of the translations matches the length of the original texts, but prioritize accuracy above all.
Return the translated texts formatted like the original dictionary. Do NOT say anything else. Return it as a JSON block."""

        client = genai.Client(api_key=gemini_api)
        response = client.models.generate_content(
            model="gemini-2.0-flash", contents=prompt)  # Use a model appropriate for your needs and API key.

        # Handle potential errors in the response, including rate limits and invalid JSON.
        try:
            # More robust JSON parsing: Handle code blocks, markdown, and other variations.
            response_text = response.text
            start = response_text.find('{')
            end = response_text.rfind('}') + 1

            if start == -1 or end == 0:
                raise ValueError("Invalid JSON response from Gemini API: No object found.")

            json_string = response_text[start:end]
            trans_dict = json.loads(json_string)
            return trans_dict
        except (ValueError, json.JSONDecodeError) as e:
            print(f"Error processing Gemini API response: {e}")
            print(f"Raw response text: {response.text}")  # Print the raw response for debugging
            return {}  # Return an empty dict on error (or raise, depending on your needs)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}


    batch_size = 30  # Adjust as needed, based on testing and Gemini's context window limits
    translated_dict = {}
    keys = list(text_dict.keys())

    # Process in batches
    for i in range(0, len(keys), batch_size):
        batch_keys = keys[i:i + batch_size]
        batch_dict = {key: text_dict[key] for key in batch_keys}
        translated_batch = translate_batch(batch_dict)

        # Merge results
        if translated_batch:  # Only merge if the translation was successful
            translated_dict.update(translated_batch)
        else:
            print(f"Skipping batch {i // batch_size} due to translation error.")

    return translated_dict
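A small usage sketch of `translate_text_dict`, assuming the `google-genai` package is installed and a valid key is available in `GEMINI_API_KEY`; the sample dictionary is illustrative, not part of this commit:

```python
# Hypothetical call into translate/translator.py; keys mirror the slide/cell identifiers the pipelines build.
import os
from translate.translator import translate_text_dict

sample = {
    "1": ["Hello everyone", "Quarterly results"],
    "2": ["Thank you"],
}

translated = translate_text_dict(
    sample,
    source_lang="English",
    target_lang="Vietnamese",
    gemini_api=os.getenv("GEMINI_API_KEY", ""),
)
print(translated)  # same keys with translated strings; batches that fail are skipped
```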
word/word_translate.py ADDED
@@ -0,0 +1,246 @@
+ import os
+ import docx
+ from docx import Document
+ from google import genai  # Gemini client used for LLM translation
+ import json
+ from docx.oxml import OxmlElement
+ from copy import deepcopy
+ import io
+ from pymongo import MongoClient
+ from gridfs import GridFS
+ from deep_translator import GoogleTranslator
+
+ gemini_api = "AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg"
+ target_language = 'vi'
+ source_language = 'en'
+
+ def batch_translate(texts, source_lang='en', target_lang="fr"):
+     """Translates multiple text segments in a single API call."""
+     if not texts:
+         return texts  # Skip if empty
+
+     prompt = f"""
+ Translate the following JSON from {source_lang} into {target_lang} while preserving names, links, symbols, and formatting:
+ {json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])}
+
+ - The JSON above is an array of objects, each with "index" and "text" keys.
+ - Ensure **one-to-one correspondence**: the output must have exactly as many items as the input.
+ - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
+ - Return only valid JSON: an array of translated objects.
+ - If the original array is empty, return an empty array.
+ """
+
+     client = genai.Client(api_key=gemini_api)
+     response = client.models.generate_content(
+         model="gemini-2.0-flash", contents=prompt)
+
+     # Extract the JSON array from the response, tolerating markdown code fences.
+     raw = response.text.strip()
+     start = raw.find('[')
+     end = raw.rfind(']') + 1
+     translated_output = json.loads(raw[start:end])
+
+     return [item["text"] for item in translated_output]
+
+ def merge_runs(runs):
+     """Merges adjacent runs that share the same style and formatting."""
+     merged_runs = []
+     for run in runs:
+         if merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run):
+             if (
+                 run.style == merged_runs[-1].style and
+                 merged_runs[-1].bold == run.bold and
+                 merged_runs[-1].italic == run.italic and
+                 merged_runs[-1].underline == run.underline and
+                 merged_runs[-1].font.size == run.font.size and
+                 merged_runs[-1].font.color.rgb == run.font.color.rgb and
+                 merged_runs[-1].font.name == run.font.name
+             ):
+                 # Same formatting as the previous run: fold the text into it.
+                 merged_runs[-1].text += run.text
+             else:
+                 merged_runs.append(run)
+         else:
+             # First item, or not a pair of plain runs (e.g. a hyperlink): keep as-is.
+             merged_runs.append(run)
+     return merged_runs
+
+ NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+
+ def translate_paragraphs(doc, source_lang, target_lang):
+     paragraphs = []
+     for para in doc.paragraphs:
+         for run in merge_runs(para.iter_inner_content()):
+             if isinstance(run, docx.text.run.Run):
+                 paragraphs.append(run.text)
+
+     # Batch by character count so each request stays under ~5000 characters.
+     translated_paragraphs = []
+     temp_batch = []
+     chars = 0
+     for para in paragraphs:
+         if len(para) + chars > 5000:
+             translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
+             temp_batch = []
+             chars = 0
+         chars += len(para)
+         temp_batch.append(para)
+     translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
+
+     if len(translated_paragraphs) > 0:
+         # Write the translated text back into the document.
+         para_index = 0
+         for para in doc.paragraphs:
+             original_para = deepcopy(para)
+             para.clear()  # Remove text while keeping paragraph properties
+             for run in merge_runs(original_para.iter_inner_content()):
+                 if isinstance(run, docx.text.run.Run):
+                     translated_text = translated_paragraphs[para_index]
+                     try:
+                         translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # Drop invalid characters
+                     except UnicodeEncodeError:
+                         translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # Replace invalid characters
+                     drawing = run._element.find(f".//{NS_W}drawing")
+                     pict = run._element.find(f".//{NS_W}pict")
+
+                     # Create a new run with the translated text and copy the formatting
+                     new_run = para.add_run(translated_text)
+                     new_run.style = run.style
+
+                     if drawing is not None:
+                         new_run._element.append(drawing)
+                     elif pict is not None:
+                         new_run._element.append(pict)
+
+                     # Copy formatting from the original run
+                     new_run.bold = run.bold
+                     new_run.italic = run.italic
+                     new_run.underline = run.underline
+                     new_run.font.size = run.font.size
+                     new_run.font.color.rgb = run.font.color.rgb
+                     new_run.font.name = run.font.name
+                     para_index += 1
+                 elif isinstance(run, docx.text.hyperlink.Hyperlink):
+                     parent = run._element
+                     tag = parent.tag.split("}")[-1]
+
+                     # Re-attach the hyperlink element unchanged, preserving its namespace
+                     new_hyperlink = OxmlElement(f"w:{tag}")
+                     for attr in parent.attrib:
+                         new_hyperlink.set(attr, parent.get(attr))
+                     for child in parent:
+                         new_hyperlink.append(child)
+                     para._element.append(new_hyperlink)
+
+
+ def translate_tables(doc, source_lang, target_lang):
+     table_texts = []
+
+     for table in doc.tables:
+         for row in table.rows:
+             for cell in row.cells:
+                 for para in cell.paragraphs:
+                     for run in merge_runs(para.iter_inner_content()):
+                         if isinstance(run, docx.text.run.Run):
+                             table_texts.append(run.text)
+
+     # Batch by character count, same as translate_paragraphs.
+     translated_tables = []
+     temp_batch = []
+     chars = 0
+     for para in table_texts:
+         if len(para) + chars > 5000:
+             translated_tables += batch_translate(temp_batch, source_lang, target_lang)
+             temp_batch = []
+             chars = 0
+         chars += len(para)
+         temp_batch.append(para)
+     translated_tables += batch_translate(temp_batch, source_lang, target_lang)
+
+     if len(translated_tables) > 0:
+         table_index = 0
+         for table in doc.tables:
+             for row in table.rows:
+                 for cell in row.cells:
+                     for para in cell.paragraphs:
+                         original_para = deepcopy(para)
+                         para.clear()  # Remove text while keeping paragraph properties
+                         for run in merge_runs(original_para.iter_inner_content()):
+                             if isinstance(run, docx.text.run.Run):
+                                 translated_text = translated_tables[table_index]
+                                 try:
+                                     translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # Drop invalid characters
+                                 except UnicodeEncodeError:
+                                     translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # Replace invalid characters
+                                 drawing = run._element.find(f".//{NS_W}drawing")
+                                 pict = run._element.find(f".//{NS_W}pict")
+
+                                 # Create a new run with the translated text and copy the formatting
+                                 new_run = para.add_run(translated_text)
+                                 new_run.style = run.style
+
+                                 if drawing is not None:
+                                     new_run._element.append(drawing)
+                                 elif pict is not None:
+                                     new_run._element.append(pict)
+
+                                 # Copy formatting from the original run
+                                 new_run.bold = run.bold
+                                 new_run.italic = run.italic
+                                 new_run.underline = run.underline
+                                 new_run.font.size = run.font.size
+                                 new_run.font.color.rgb = run.font.color.rgb
+                                 new_run.font.name = run.font.name
+                                 table_index += 1
+                             elif isinstance(run, docx.text.hyperlink.Hyperlink):
+                                 parent = run._element
+                                 tag = parent.tag.split("}")[-1]
+
+                                 # Re-attach the hyperlink element unchanged, preserving its namespace
+                                 new_hyperlink = OxmlElement(f"w:{tag}")
+                                 for attr in parent.attrib:
+                                     new_hyperlink.set(attr, parent.get(attr))
+                                 for child in parent:
+                                     new_hyperlink.append(child)
+                                 para._element.append(new_hyperlink)
+
+ def translate_header_footer(doc, source_lang, target_lang):
+     head_foot = []
+     for section in doc.sections:
+         for header in section.header.paragraphs:
+             for run in header.runs:
+                 head_foot.append(run.text)
+         for footer in section.footer.paragraphs:
+             for run in footer.runs:
+                 head_foot.append(run.text)
+     translated_head_foot = batch_translate(head_foot, source_lang, target_lang)
+
+     i = 0
+     for section in doc.sections:
+         for header in section.header.paragraphs:
+             for run in header.runs:
+                 run.text = translated_head_foot[i]
+                 i += 1
+         for footer in section.footer.paragraphs:
+             for run in footer.runs:
+                 run.text = translated_head_foot[i]
+                 i += 1
+
+ def translate_docx(file_id, source_lang='en', target_lang='fr', db_name='word'):
+     client = MongoClient('mongodb://localhost:27017/')
+     db = client[db_name]
+     fs_input = GridFS(db, collection="root_file")
+     fs_output = GridFS(db, collection="final_file")
+
+     file_data = fs_input.get(file_id).read()
+     input_doc = Document(io.BytesIO(file_data))
+
+     translate_paragraphs(input_doc, source_lang, target_lang)
+     translate_tables(input_doc, source_lang, target_lang)
+     translate_header_footer(input_doc, source_lang, target_lang)
+
+     output_stream = io.BytesIO()
+     input_doc.save(output_stream)
+     output_stream.seek(0)
+
+     translated_file_id = fs_output.put(output_stream, filename=f"{target_lang}_translated.docx")
+     print(f"Translation complete! Saved with file ID: {translated_file_id}")
+
+     return translated_file_id
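`translate_docx` ties the module together: it loads the source .docx from the `root_file` GridFS collection, translates paragraphs, tables, headers, and footers in place, and stores the result in `final_file`. A minimal end-to-end sketch, assuming MongoDB is running locally and using placeholder file names:

```python
# Illustrative sketch: file names are placeholders; assumes MongoDB on localhost:27017.
from pymongo import MongoClient
from gridfs import GridFS
from word.word_translate import translate_docx

client = MongoClient("mongodb://localhost:27017/")
db = client["word"]

# Upload the source document into the collection translate_docx reads from.
with open("report.docx", "rb") as f:
    file_id = GridFS(db, collection="root_file").put(f.read(), filename="report.docx")

# Translate and store the result in the "final_file" collection.
translated_id = translate_docx(file_id, source_lang="en", target_lang="vi", db_name="word")

# Download the translated document.
data = GridFS(db, collection="final_file").get(translated_id).read()
with open("report_vi.docx", "wb") as f:
    f.write(data)
```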