Spaces:

mintlee
/

MT_deploy

Running

App Files Community

mintlee committed on Jun 22

Commit

ff93898

1 Parent(s): b14389a

add no mongodb

Browse files

Files changed (11) hide show

db/__pycache__/mongodb.cpython-310.pyc +0 -0
excel/__pycache__/excel_translate.cpython-310.pyc +0 -0
excel/__pycache__/xlsx.cpython-310.pyc +0 -0
excel/excel_translate.py +16 -139
excel/xlsx.py +14 -25
pages/upload.py +17 -21
powerpoint/__pycache__/pptx.cpython-310.pyc +0 -0
powerpoint/pptx.py +34 -68
test.ipynb +80 -20
word/__pycache__/word_helper.cpython-310.pyc +0 -0
word/word_helper.py +6 -18

db/__pycache__/mongodb.cpython-310.pyc CHANGED Viewed

Binary files a/db/__pycache__/mongodb.cpython-310.pyc and b/db/__pycache__/mongodb.cpython-310.pyc differ

excel/__pycache__/excel_translate.cpython-310.pyc CHANGED Viewed

Binary files a/excel/__pycache__/excel_translate.cpython-310.pyc and b/excel/__pycache__/excel_translate.cpython-310.pyc differ

excel/__pycache__/xlsx.cpython-310.pyc CHANGED Viewed

Binary files a/excel/__pycache__/xlsx.cpython-310.pyc and b/excel/__pycache__/xlsx.cpython-310.pyc differ

excel/excel_translate.py CHANGED Viewed

@@ -1,171 +1,48 @@
-import openpyxl
-from typing import Dict, List
 from translate.translator import translate_text_dict
 import math
 import chardet
-import io
-import pandas as pd
-import pymongo
-import gridfs
-import tempfile
-import os
-def translate_xlsx(file_id: str, target_lang: str = ""):
-    # Kết nối MongoDB
-    client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client["excel"]
-    fs_input = gridfs.GridFS(db, collection="root_file")
-    fs_output = gridfs.GridFS(db, collection="final_file")
-    # Tải file từ MongoDB
-    file_data = fs_input.get(file_id)
-    # Lưu file tạm thời
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
-        temp_file.write(file_data.read())
-        temp_file_path = temp_file.name
-    # Đọc file Excel bằng openpyxl
-    wb = openpyxl.load_workbook(temp_file_path)
-    sheets = wb.worksheets  # Chọn tất cả sheets nếu sheet_name không hợp lệ
-    for ws in sheets:
-        max_row = ws.max_row
-        max_col = ws.max_column
-        # Tạo dictionary lưu trữ nội dung cần dịch và mapping từ key đến cell
-        text_dict: Dict[str, List[str]] = {}
-        cell_map: Dict[str, any] = {}  # lưu mapping key -> cell object
-        for row in range(1, max_row + 1):
-            for col in range(1, max_col + 1):
-                cell = ws.cell(row=row, column=col)
-                if isinstance(cell.value, str):
-                    key = f"R{row}C{col}"  # key theo dạng R{row}C{col}
-                    text_dict[key] = [cell.value]  # Lưu giá trị dưới dạng danh sách với 1 phần tử
-                    cell_map[key] = cell
-        # Gọi hàm dịch theo dạng bulk
-        translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
-        # Cập nhật lại các cell với nội dung đã dịch
-        for key, cell in cell_map.items():
-            if key in translated_dict:
-                translated_text_list = translated_dict[key]
-                if translated_text_list and len(translated_text_list) > 0:
-                    cell.value = translated_text_list[0]
-    # Lưu workbook vào file tạm thời
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
-        wb.save(output_file.name)
-        output_file.seek(0)
-        translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
-    # Đóng workbook và xóa file tạm
-    wb.close()
-    os.remove(temp_file_path)
-    print(f"✅ Dịch thành công! File đã lưu vào MongoDB với file_id: {translated_file_id}")
-    return translated_file_id
-def read_csv_with_auto_encoding(csv_path):
-    # Đọc file dưới dạng nhị phân
-    with open(csv_path, "rb") as f:
-        raw_data = f.read()
-        # Dò tìm encoding
-        detect_result = chardet.detect(raw_data)
-        encoding = detect_result["encoding"]
-        confidence = detect_result["confidence"]
-        print(f"Chardet dự đoán file '{csv_path}' có encoding = {encoding} (độ tin cậy = {confidence})")
-        # Nếu chardet không phát hiện được, ta đặt fallback = 'utf-8'
-        if encoding is None:
-            encoding = "utf-8"
     decoded_data = raw_data.decode(encoding, errors='replace')
-    # Sử dụng io.StringIO để chuyển đổi chuỗi thành đối tượng file-like
     csv_data = io.StringIO(decoded_data)
-    df = pd.read_csv(csv_data)
-    return df
-def translate_csv(file_id, source_lang, target_lang="vi", chunk_size=50):
-    # Kết nối MongoDB
-    client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client["csv"]
-    fs_input = gridfs.GridFS(db, collection="root_file")
-    fs_output = gridfs.GridFS(db, collection="final_file")
-    # Tải file từ MongoDB
-    file_data = fs_input.get(file_id).read()
-    # Lưu file tạm thời
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
-        temp_file.write(file_data)
-        temp_file_path = temp_file.name
-    df = read_csv_with_auto_encoding(temp_file_path)
-    # If text_columns is not specified, we assume we want to translate everything that looks like text.
-    # Otherwise, only translate the given columns.
     text_columns = df.select_dtypes(include=["object"]).columns.tolist()
     num_rows = len(df)
     num_chunks = math.ceil(num_rows / chunk_size)
-    translated_df = df.copy()  # copy to store the final translations
     for chunk_index in range(num_chunks):
         start_idx = chunk_index * chunk_size
         end_idx = min((chunk_index + 1) * chunk_size, num_rows)
         chunk_df = df.iloc[start_idx:end_idx]
-        # Build a dictionary structure. For example, row-based:
-        # {
-        #   "0": {"colA": "some text", "colB": "some text"},
-        #   "1": {"colA": "some text", "colB": "some text"},
-        #   ...
-        # }
         chunk_dict = {}
         for i, row in chunk_df.iterrows():
-            row_dict = {}
-            for col in text_columns:
-                row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
             chunk_dict[str(i)] = row_dict
-        # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
             source_lang=source_lang,
             target_lang=target_lang
         )
-        # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
         for i_str, row_data in translated_chunk.items():
             i = int(i_str)
             for col, translated_val in row_data.items():
                 translated_df.at[i, col] = translated_val
-    # Lưu file dịch vào tệp tạm thời
-    translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
-    translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')
-    # Đọc lại file tạm để lưu vào MongoDB
-    with open(translated_file_path, "rb") as f:
-        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")
-    # Xóa file tạm
-    os.remove(temp_file_path)
-    os.remove(translated_file_path)
-    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
-    return translated_file_id

+import io
+import pandas as pd
 from translate.translator import translate_text_dict
 import math
 import chardet
+def read_csv_with_auto_encoding_from_bytes(csv_bytes) -> pd.DataFrame:
+    raw_data = csv_bytes.read()
+    detect_result = chardet.detect(raw_data)
+    encoding = detect_result["encoding"] or "utf-8"
     decoded_data = raw_data.decode(encoding, errors='replace')
     csv_data = io.StringIO(decoded_data)
+    return pd.read_csv(csv_data)
+def translate_csv(file_bytes, file_name, source_lang: str, target_lang: str = "vi", chunk_size: int = 50) -> bytes:
+    df = read_csv_with_auto_encoding_from_bytes(file_bytes)
     text_columns = df.select_dtypes(include=["object"]).columns.tolist()
     num_rows = len(df)
     num_chunks = math.ceil(num_rows / chunk_size)
+    translated_df = df.copy()
     for chunk_index in range(num_chunks):
         start_idx = chunk_index * chunk_size
         end_idx = min((chunk_index + 1) * chunk_size, num_rows)
         chunk_df = df.iloc[start_idx:end_idx]
         chunk_dict = {}
         for i, row in chunk_df.iterrows():
+            row_dict = {col: str(row[col]) if pd.notnull(row[col]) else "" for col in text_columns}
             chunk_dict[str(i)] = row_dict
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
             source_lang=source_lang,
             target_lang=target_lang
         )
         for i_str, row_data in translated_chunk.items():
             i = int(i_str)
             for col, translated_val in row_data.items():
                 translated_df.at[i, col] = translated_val
+    output_buffer = io.BytesIO()
+    translated_df.to_csv(output_buffer, index=False, encoding='utf-8-sig')
+    output_buffer.seek(0)
+    return output_buffer, file_name

excel/xlsx.py CHANGED Viewed

@@ -406,9 +406,12 @@ def translate_sheet_names_via_regex(
         traceback.print_exc()
-def zip_folder_to_excel_file(folder_path, file_name):
     try:
-        # Nén thư mục thành file .xlsx trong RAM
         xlsx_buffer = io.BytesIO()
         with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
             for root, _, files in os.walk(folder_path):
@@ -418,19 +421,11 @@ def zip_folder_to_excel_file(folder_path, file_name):
                     zipf.write(file_path, archive_path)
         xlsx_buffer.seek(0)
-        client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-        db = client['excel']
-        fs = gridfs.GridFS(db, collection='final_file')
-        file_id = fs.put(xlsx_buffer.read(), filename=file_name)
-        print(f"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}")
-        return file_id
     except Exception as e:
-        print(f"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}")
-        return None
 def get_text_list_from_nodes(modifiable_nodes: Optional[List[Dict[str, Any]]]) -> List[str]:
     if modifiable_nodes is None:
@@ -482,16 +477,10 @@ def _translate_batch_helper(segments_to_translate, original_indices_1based, sour
     return batch_results
-def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):
-    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client['excel']
-    fs = gridfs.GridFS(db, collection='root_file')
-    ppt_file = fs.get(file_id)
-    excel_file = BytesIO(ppt_file.read())
-    xml_folder = unzip_office_file(excel_file)
     path_to_workbook_xml = os.path.join(xml_folder, "xl", "workbook.xml")
     translate_sheet_names_via_regex(path_to_workbook_xml, source_lang, target_lang)
@@ -600,9 +589,9 @@ def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch
             print("LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.")
         else:
             # Only zip if saving XML was successful
-            final_id = zip_folder_to_excel_file(xml_folder, file_name)
-            if final_id:
                 shutil.rmtree(xml_folder) # Mark folder as 'handled' by zipping
             else:
                 print("LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.")
-    return final_id

         traceback.print_exc()
+def zip_folder_to_excel_bytes(folder_path):
+    """
+    Nén toàn bộ thư mục thành file Excel (.xlsx) dưới dạng BytesIO (trong RAM).
+    Trả lại buffer BytesIO chứa nội dung file.
+    """
     try:
         xlsx_buffer = io.BytesIO()
         with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
             for root, _, files in os.walk(folder_path):
                     zipf.write(file_path, archive_path)
         xlsx_buffer.seek(0)
+        return xlsx_buffer
     except Exception as e:
+        print(f"❌ Lỗi khi nén thư mục thành file Excel: {e}")
+        return None
 def get_text_list_from_nodes(modifiable_nodes: Optional[List[Dict[str, Any]]]) -> List[str]:
     if modifiable_nodes is None:
     return batch_results
+def translate_xlsx(file_io, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):
+    file_io.seek(0)
+    xml_folder = unzip_office_file(file_io)
     path_to_workbook_xml = os.path.join(xml_folder, "xl", "workbook.xml")
     translate_sheet_names_via_regex(path_to_workbook_xml, source_lang, target_lang)
             print("LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.")
         else:
             # Only zip if saving XML was successful
+            translated_buffer = zip_folder_to_excel_bytes(xml_folder)
+            if translated_buffer:
                 shutil.rmtree(xml_folder) # Mark folder as 'handled' by zipping
             else:
                 print("LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.")
+    return translated_buffer, file_name

pages/upload.py CHANGED Viewed

@@ -9,10 +9,9 @@ import dotenv
 import os
 dotenv.load_dotenv(".env")
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
-# Cấu hình trang
 st.set_page_config(page_title="Translate Your File", page_icon="🌍", layout="centered")
 # CSS custom
@@ -55,7 +54,6 @@ st.markdown("""
             color: black !important;
             border-radius: 8px;
         }
-        /* Thu hẹp khoảng cách giữa label và selectbox */
         .stSelectbox label {
             margin-bottom: 0.2rem;
             font-weight: bold;
@@ -65,11 +63,12 @@ st.markdown("""
     </style>
 """, unsafe_allow_html=True)
-# Upload file section
 with st.container():
     st.markdown("### 📂 Chọn file để dịch")
     uploaded_file = st.file_uploader("Kéo thả hoặc chọn file", type=['pptx', 'xlsx', 'csv', 'docx'])
 with st.container():
     col1, col2 = st.columns(2)
@@ -81,39 +80,36 @@ with st.container():
         st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ muốn dịch sang</p>', unsafe_allow_html=True)
         target_lang = st.selectbox("  ", ["chinese", "english", "vietnamese"], key="target_lang")
 def process_file(file, file_type):
     progress_bar = st.progress(0)
-    with st.spinner("🔄 Đang lưu file lên hệ thống..."):
-        file_id, file_name = save_file_to_mongodb(uploaded_file=file, db_name=file_type.lower(), collection_name="root_file")
-        progress_bar.progress(20)
-        st.write(f"📂 **File ID:** `{file_id}`")
     with st.spinner("🔍 Đang xử lý và dịch tài liệu..."):
         if file_type == "PPTX":
-            final_id = translate_pptx(file_id, file_name, source_lang=source_lang, target_lang=target_lang, slides_per_batch=5)
-            progress_bar.progress(60)
         elif file_type == "Excel":
-            final_id = translate_xlsx(file_id=file_id, file_name=file_name, source_lang=source_lang, target_lang=target_lang)
         elif file_type == "CSV":
-            final_id = translate_csv(file_id=file_id, source_lang=source_lang, target_lang=target_lang)
         elif file_type == "Word":
-            final_id = translate_docx(file_id=file_id, file_name=file_name, source_lang=source_lang, target_lang=target_lang)
         else:
             st.error("❌ Loại file không hỗ trợ!")
             return
-    progress_bar.progress(80)
-    with st.spinner("📦 Đang tải file đã dịch..."):
-        file_io, file_name = fetch_file_from_mongodb(file_type.lower(), "final_file", final_id)
-        progress_bar.progress(100)
-    if file_io:
         st.success("🎉 File đã được dịch thành công!")
-        st.download_button("⬇️ Tải file về", data=file_io.getvalue(), file_name=file_name)
     else:
-        st.error("❌ Không thể tải xuống file. Vui lòng thử lại!")
 if uploaded_file and st.button("🚀 Upload và dịch ngay!"):
     with st.spinner("🔎 Đang phát hiện loại file..."):
         file_type = detect_file_type(uploaded_file)

 import os
 dotenv.load_dotenv(".env")
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+# Cấu hình giao diện
 st.set_page_config(page_title="Translate Your File", page_icon="🌍", layout="centered")
 # CSS custom
             color: black !important;
             border-radius: 8px;
         }
         .stSelectbox label {
             margin-bottom: 0.2rem;
             font-weight: bold;
     </style>
 """, unsafe_allow_html=True)
+# Upload file
 with st.container():
     st.markdown("### 📂 Chọn file để dịch")
     uploaded_file = st.file_uploader("Kéo thả hoặc chọn file", type=['pptx', 'xlsx', 'csv', 'docx'])
+# Lựa chọn ngôn ngữ
 with st.container():
     col1, col2 = st.columns(2)
         st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ muốn dịch sang</p>', unsafe_allow_html=True)
         target_lang = st.selectbox("  ", ["chinese", "english", "vietnamese"], key="target_lang")
+# Xử lý file trực tiếp
 def process_file(file, file_type):
     progress_bar = st.progress(0)
+    file_name = file.name
+    progress_bar.progress(10)
     with st.spinner("🔍 Đang xử lý và dịch tài liệu..."):
         if file_type == "PPTX":
+            output_io, output_name = translate_pptx(file, file_name, source_lang=source_lang, target_lang=target_lang)
         elif file_type == "Excel":
+            output_io, output_name = translate_xlsx(file, file_name, source_lang=source_lang, target_lang=target_lang)
         elif file_type == "CSV":
+            output_io, output_name = translate_csv(file, file_name, source_lang=source_lang, target_lang=target_lang)
         elif file_type == "Word":
+            output_io, output_name = translate_docx(file, file_name, source_lang=source_lang, target_lang=target_lang)
         else:
             st.error("❌ Loại file không hỗ trợ!")
             return
+    progress_bar.progress(100)
+    if output_io:
         st.success("🎉 File đã được dịch thành công!")
+        print(f"✅ File đã dịch: {output_name}")
+        st.download_button("⬇️ Tải file về", data=output_io.getvalue(), file_name=output_name)
     else:
+        st.error("❌ Xảy ra lỗi khi xử lý file.")
+# Nút xử lý
 if uploaded_file and st.button("🚀 Upload và dịch ngay!"):
     with st.spinner("🔎 Đang phát hiện loại file..."):
         file_type = detect_file_type(uploaded_file)

powerpoint/__pycache__/pptx.cpython-310.pyc CHANGED Viewed

Binary files a/powerpoint/__pycache__/pptx.cpython-310.pyc and b/powerpoint/__pycache__/pptx.cpython-310.pyc differ

powerpoint/pptx.py CHANGED Viewed

@@ -3,15 +3,13 @@ import zipfile
 import shutil
 from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
 from powerpoint.xml_handling import *
-from pymongo import MongoClient
-import gridfs
 from io import BytesIO
-def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
     """
-    Tạo file PPTX từ thư mục chứa nội dung đã giải nén và lưu vào MongoDB mà không lưu file trên ổ cứng.
     """
-    pptx_buffer = BytesIO()
     with zipfile.ZipFile(pptx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
         for root_dir, _, files in os.walk(temp_dir):
@@ -20,45 +18,32 @@ def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
                 arcname = os.path.relpath(file_path, temp_dir)
                 zipf.write(file_path, arcname)
-    pptx_buffer.seek(0)
-    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client['pptx']
-    fs = gridfs.GridFS(db, collection='final_file')
-    file_id = fs.put(pptx_buffer, filename=pptx_filename)
-    print(f"PPTX đã được lưu vào MongoDB với ID: {file_id}")
-    client.close()
-    return file_id
-def translate_and_replace_pptx(xml_folder, file_name, source_lang='vn', target_lang='en', slides_per_batch=5):
     slides_dir = os.path.join(xml_folder, "ppt/slides")
     all_slides = sorted([f for f in os.listdir(slides_dir)
                          if f.startswith("slide") and f.endswith(".xml")],
                         key=lambda x: int(x[5:-4]))
-    # Xử lý theo từng batch slide
     for i in range(0, len(all_slides), slides_per_batch):
         batch_slides = all_slides[i:i + slides_per_batch]
-        slide_text_mapping = {}
-        smartart_text_mapping = {}
         for slide_file in batch_slides:
             slide_index = int(slide_file[5:-4])
             slide_path = os.path.join(slides_dir, slide_file)
-            slide_text_mapping[slide_index] = extract_text_from_slide(slide_path)  # Lấy list các tuple (text, rPr)
-            # Xử lý SmartArt qua file .rels của slide
             rels_file = os.path.join(xml_folder, "ppt/slides/_rels", slide_file + ".rels")
             base_path = os.path.join(xml_folder, "ppt")
             smartart_data_path = get_smartart_data_file(rels_file, base_path)
             if smartart_data_path:
-                smartart_text_mapping[slide_index] = extract_text_from_smartart(smartart_data_path) # Lấy list các tuple (text, rPr)
-        # Gộp text để dịch theo batch, giữ lại rPr
         combined_slide_text_list = []
         for slide_index in sorted(slide_text_mapping.keys()):
             combined_slide_text_list.extend(slide_text_mapping[slide_index])
@@ -66,48 +51,33 @@ def translate_and_replace_pptx(xml_folder, file_name, source_lang='vn', target_l
         combined_smartart_text_list = []
         for slide_index in sorted(smartart_text_mapping.keys()):
             combined_smartart_text_list.extend(smartart_text_mapping[slide_index])
-        # Tách text ra khỏi tuple để dịch
         slide_texts_to_translate = [text for text, _ in combined_slide_text_list]
         smartart_texts_to_translate = [text for text, _ in combined_smartart_text_list]
-        # Dịch văn bản slide và SmartArt
-        combined_slide_text_string = preprocess_text(slide_texts_to_translate)
-        combined_smartart_text_string = preprocess_text(smartart_texts_to_translate)
-        translated_slide_string = translate_text(combined_slide_text_string, source_lang, target_lang)
-        translated_smartart_string = translate_text(combined_smartart_text_string, source_lang, target_lang)
-        # Postprocess để có list các văn bản đã dịch
         translated_slide_texts = postprocess_text(translated_slide_string)
         translated_smartart_texts = postprocess_text(translated_smartart_string)
-        # **Quan trọng:** Tạo danh sách tuple (translated_text, rPr)
         translated_slide_data = []
         for i, (original_text, rPr) in enumerate(combined_slide_text_list):
-            if i < len(translated_slide_texts):
-                translated_slide_data.append((translated_slide_texts[i], rPr))
-            else:
-                translated_slide_data.append(("", rPr)) # Trường hợp không đủ translated text
         translated_smartart_data = []
         for i, (original_text, rPr) in enumerate(combined_smartart_text_list):
-            if i < len(translated_smartart_texts):
-                translated_smartart_data.append((translated_smartart_texts[i], rPr))
-            else:
-                translated_smartart_data.append(("", rPr))  # Trường hợp không đủ translated text
-        # Thay thế văn bản trong slide
-        slide_index = 0
         for slide_index in sorted(slide_text_mapping.keys()):
             slide_file = f"slide{slide_index}.xml"
             slide_path = os.path.join(slides_dir, slide_file)
             num_texts = len(slide_text_mapping[slide_index])
             replace_data = translated_slide_data[:num_texts]
-            replace_text_in_slide(slide_path, replace_data) # truyền vào danh sách (translated_text, rPr)
-            translated_slide_data = translated_slide_data[num_texts:]  # Cập nhật danh sách cho slide tiếp theo
-        # Thay thế văn bản trong SmartArt
         for slide_index in sorted(smartart_text_mapping.keys()):
             rels_file = os.path.join(xml_folder, "ppt/slides/_rels", f"slide{slide_index}.xml.rels")
             base_path = os.path.join(xml_folder, "ppt")
@@ -115,23 +85,19 @@ def translate_and_replace_pptx(xml_folder, file_name, source_lang='vn', target_l
             if smartart_data_path:
                 num_texts = len(smartart_text_mapping[slide_index])
                 replace_data = translated_smartart_data[:num_texts]
-                replace_text_in_smartart(smartart_data_path, replace_data, None) # truyền vào danh sách (translated_text, rPr)
-                translated_smartart_data = translated_smartart_data[num_texts:]  # Cập nhật danh sách cho slide tiếp theo
-    file_id = create_pptx_and_store_in_mongodb(xml_folder, file_name)
-    return file_id
-def translate_pptx(pptx_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5):
-    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client['pptx']
-    fs = gridfs.GridFS(db, collection='root_file')
-    ppt_file = fs.get(pptx_id)
-    prs = BytesIO(ppt_file.read())
-    xml_folder = unzip_office_file(prs)
-    file_id =  translate_and_replace_pptx(xml_folder, file_name, source_lang, target_lang, slides_per_batch=slides_per_batch)
-    shutil.rmtree(xml_folder)
-    return file_id

 import shutil
 from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
 from powerpoint.xml_handling import *
 from io import BytesIO
+def create_pptx_from_dir(temp_dir, pptx_filename):
     """
+    Tạo file PPTX từ thư mục chứa nội dung đã giải nén và trả về BytesIO object.
     """
+    pptx_buffer = BytesIO()
     with zipfile.ZipFile(pptx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
         for root_dir, _, files in os.walk(temp_dir):
                 arcname = os.path.relpath(file_path, temp_dir)
                 zipf.write(file_path, arcname)
+    pptx_buffer.seek(0)
+    return pptx_buffer, pptx_filename
+def translate_and_replace_pptx(xml_folder, source_lang='vn', target_lang='en', slides_per_batch=5):
     slides_dir = os.path.join(xml_folder, "ppt/slides")
     all_slides = sorted([f for f in os.listdir(slides_dir)
                          if f.startswith("slide") and f.endswith(".xml")],
                         key=lambda x: int(x[5:-4]))
     for i in range(0, len(all_slides), slides_per_batch):
         batch_slides = all_slides[i:i + slides_per_batch]
+        slide_text_mapping = {}
+        smartart_text_mapping = {}
         for slide_file in batch_slides:
             slide_index = int(slide_file[5:-4])
             slide_path = os.path.join(slides_dir, slide_file)
+            slide_text_mapping[slide_index] = extract_text_from_slide(slide_path)
             rels_file = os.path.join(xml_folder, "ppt/slides/_rels", slide_file + ".rels")
             base_path = os.path.join(xml_folder, "ppt")
             smartart_data_path = get_smartart_data_file(rels_file, base_path)
             if smartart_data_path:
+                smartart_text_mapping[slide_index] = extract_text_from_smartart(smartart_data_path)
+        # Gộp text
         combined_slide_text_list = []
         for slide_index in sorted(slide_text_mapping.keys()):
             combined_slide_text_list.extend(slide_text_mapping[slide_index])
         combined_smartart_text_list = []
         for slide_index in sorted(smartart_text_mapping.keys()):
             combined_smartart_text_list.extend(smartart_text_mapping[slide_index])
+        # Dịch
         slide_texts_to_translate = [text for text, _ in combined_slide_text_list]
         smartart_texts_to_translate = [text for text, _ in combined_smartart_text_list]
+        translated_slide_string = translate_text(preprocess_text(slide_texts_to_translate), source_lang, target_lang)
+        translated_smartart_string = translate_text(preprocess_text(smartart_texts_to_translate), source_lang, target_lang)
         translated_slide_texts = postprocess_text(translated_slide_string)
         translated_smartart_texts = postprocess_text(translated_smartart_string)
         translated_slide_data = []
         for i, (original_text, rPr) in enumerate(combined_slide_text_list):
+            translated_slide_data.append((translated_slide_texts[i] if i < len(translated_slide_texts) else "", rPr))
         translated_smartart_data = []
         for i, (original_text, rPr) in enumerate(combined_smartart_text_list):
+            translated_smartart_data.append((translated_smartart_texts[i] if i < len(translated_smartart_texts) else "", rPr))
         for slide_index in sorted(slide_text_mapping.keys()):
             slide_file = f"slide{slide_index}.xml"
             slide_path = os.path.join(slides_dir, slide_file)
             num_texts = len(slide_text_mapping[slide_index])
             replace_data = translated_slide_data[:num_texts]
+            replace_text_in_slide(slide_path, replace_data)
+            translated_slide_data = translated_slide_data[num_texts:]
         for slide_index in sorted(smartart_text_mapping.keys()):
             rels_file = os.path.join(xml_folder, "ppt/slides/_rels", f"slide{slide_index}.xml.rels")
             base_path = os.path.join(xml_folder, "ppt")
             if smartart_data_path:
                 num_texts = len(smartart_text_mapping[slide_index])
                 replace_data = translated_smartart_data[:num_texts]
+                replace_text_in_smartart(smartart_data_path, replace_data, None)
+                translated_smartart_data = translated_smartart_data[num_texts:]
+def translate_pptx(file_obj: BytesIO, file_name: str, source_lang='vn', target_lang='en', slides_per_batch=5):
+    """
+    Hàm chính: nhận file PPTX (BytesIO), dịch, và trả về BytesIO của file đã dịch.
+    """
+    file_obj.seek(0)
+    xml_folder = unzip_office_file(file_obj)
+    translate_and_replace_pptx(xml_folder, source_lang, target_lang, slides_per_batch)
+    translated_io, translated_filename = create_pptx_from_dir(xml_folder, file_name)
+    shutil.rmtree(xml_folder)
+    return translated_io, translated_filename

test.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -119,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -150,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -181,36 +181,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\MXL1166配套表.docx\n"
      ]
     }
    ],
    "source": [
-    "download_input_from_mongodb(file_id=\"684002b9047f70beae0bdf2e\", save_name=\"MXL1166配套表.docx\", db_name=\"word\", collection_name=\"root_file\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\MXL1147配套表.docx\n"
      ]
     }
    ],
    "source": [
-    "download_output_from_mongodb(file_id=\"68400205047f70beae0bdf2a\", save_name=\"MXL1147配套表.docx\", db_name=\"word\", collection_name=\"final_file\")"
    ]
   },
   {
@@ -244,7 +244,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -252,12 +252,12 @@
      "output_type": "stream",
      "text": [
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'word'\n",
-      "✅ Đã xóa 4 file trong collection 'root_file' của db 'excel'\n",
-      "✅ Đã xóa 0 file trong collection 'root_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'csv'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'word'\n",
-      "✅ Đã xóa 4 file trong collection 'final_file' của db 'excel'\n",
-      "✅ Đã xóa 0 file trong collection 'final_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'csv'\n"
      ]
     }
@@ -270,16 +270,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "91.66238403320312"
       ]
      },
-     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -291,11 +291,71 @@
     "for db_name in ['word', 'exce', 'pptx', 'csv']:\n",
     "    db = client[db_name]\n",
     "    stats = db.command(\"dbstats\")\n",
-    "    db_size = stats.get(\"dataSize\", 0)\n",
     "    total_size += db_size\n",
     "total_size / (1024** 2)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 9,

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\szero-point-ieks-essays-1350537845-9781350537842_compress.docx\n"
      ]
     }
    ],
    "source": [
+    "download_input_from_mongodb(file_id=\"6843696876015abc15cc759f\", save_name=\"szero-point-ieks-essays-1350537845-9781350537842_compress.docx\", db_name=\"word\", collection_name=\"root_file\")"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\samsung_presentation_vietnamese.pptx\n"
      ]
     }
    ],
    "source": [
+    "download_output_from_mongodb(file_id=\"684194c376015abc15cc7428\", save_name=\"samsung_presentation_vietnamese.pptx\", db_name=\"pptx\", collection_name=\"final_file\")"
    ]
   },
   {
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "output_type": "stream",
      "text": [
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'word'\n",
+      "✅ Đã xóa 0 file trong collection 'root_file' của db 'excel'\n",
+      "✅ Đã xóa 3 file trong collection 'root_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'csv'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'word'\n",
+      "✅ Đã xóa 0 file trong collection 'final_file' của db 'excel'\n",
+      "✅ Đã xóa 3 file trong collection 'final_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'csv'\n"
      ]
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "0.0"
       ]
      },
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
     "for db_name in ['word', 'exce', 'pptx', 'csv']:\n",
     "    db = client[db_name]\n",
     "    stats = db.command(\"dbstats\")\n",
+    "    db_size = stats.get(\"StorageSize\", 0)\n",
     "    total_size += db_size\n",
     "total_size / (1024** 2)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Đã lấy 0 dữ liệu đo lường.\n",
+      "⚠️ Không có dữ liệu đo lường.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "from requests.auth import HTTPDigestAuth\n",
+    "import datetime\n",
+    "\n",
+    "# ==== Cấu hình ====\n",
+    "PUBLIC_KEY = 'uetgyqkj'\n",
+    "PRIVATE_KEY = '892caec5-8474-4043-862b-f4d4c617daa2'\n",
+    "GROUP_ID = '67db8bf4ed971c2114aad7f1#'         # còn gọi là Project ID\n",
+    "CLUSTER_NAME = 'Cluster0'\n",
+    "\n",
+    "# ==== Lấy metric dung lượng dữ liệu ====\n",
+    "url = f\"https://cloud.mongodb.com/api/atlas/v1.0/groups/{GROUP_ID}/clusters/{CLUSTER_NAME}/measurements\"\n",
+    "\n",
+    "params = {\n",
+    "    \"granularity\": \"PT1M\",           # lấy theo từng phút\n",
+    "    \"period\": \"PT1H\",                # 1 giờ gần nhất\n",
+    "    \"m\": \"DATA_SIZE_TOTAL\",          # metric cần lấy\n",
+    "}\n",
+    "\n",
+    "response = requests.get(\n",
+    "    url,\n",
+    "    auth=HTTPDigestAuth(PUBLIC_KEY, PRIVATE_KEY),\n",
+    "    params=params\n",
+    ")\n",
+    "\n",
+    "# ==== Xử lý kết quả ====\n",
+    "if response.status_code == 200:\n",
+    "    data = response.json()\n",
+    "    measurements = data.get(\"measurements\", [])\n",
+    "    print(f\"✅ Đã lấy {len(measurements)} dữ liệu đo lường.\")\n",
+    "    if measurements:\n",
+    "        datapoints = measurements[0].get(\"dataPoints\", [])\n",
+    "        if datapoints:\n",
+    "            latest_point = [d for d in datapoints if d['value'] is not None][-1]\n",
+    "            value_bytes = latest_point['value']\n",
+    "            ts = latest_point['timestamp']\n",
+    "            print(f\"✅ Dung lượng hiện tại: {value_bytes / (1024**2):.2f} MB (timestamp: {ts})\")\n",
+    "        else:\n",
+    "            print(\"⚠️ Không có datapoint nào.\")\n",
+    "    else:\n",
+    "        print(\"⚠️ Không có dữ liệu đo lường.\")\n",
+    "else:\n",
+    "    print(f\"❌ Lỗi {response.status_code}: {response.text}\")\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 9,

word/__pycache__/word_helper.cpython-310.pyc CHANGED Viewed

Binary files a/word/__pycache__/word_helper.cpython-310.pyc and b/word/__pycache__/word_helper.cpython-310.pyc differ

word/word_helper.py CHANGED Viewed

@@ -8,8 +8,6 @@ import re
 import time
 import dotenv
 import os
-from pymongo import MongoClient
-import gridfs
 from io import BytesIO
 dotenv.load_dotenv(".env")
@@ -253,18 +251,13 @@ def merge_elements(doc):
                                 current_run = [element]
     return doc
-def translate_docx(file_id, source_lang="English", target_lang="Vietnamese", file_name=''):
-    """Translates a Word document and saves the output to MongoDB."""
-    client = MongoClient(os.getenv("MONGODB_URI"))
-    db = client["word"]
-    fs_input = gridfs.GridFS(db, collection="root_file")
-    fs_output = gridfs.GridFS(db, collection="final_file")
-    # Lấy file gốc từ MongoDB
-    input_file = fs_input.get(file_id)
-    doc = Document(BytesIO(input_file.read()))
-    # Dịch nội dung
     doc = merge_elements(doc)
     print('Translating paragraphs.')
@@ -286,13 +279,8 @@ def translate_docx(file_id, source_lang="English", target_lang="Vietnamese", fil
     translate_header_footer(doc, source_lang, target_lang)
     print('Done translating headers & footers.')
-    # Lưu tài liệu đã dịch vào MongoDB
     output_stream = BytesIO()
     doc.save(output_stream)
     output_stream.seek(0)
-    translated_file_id = fs_output.put(output_stream, filename=file_name)
-    client.close()
-    print(f"Translation complete! Saved to MongoDB with id: {translated_file_id}")
-    return translated_file_id

 import time
 import dotenv
 import os
 from io import BytesIO
 dotenv.load_dotenv(".env")
                                 current_run = [element]
     return doc
+def translate_docx(uploaded_file, file_name, source_lang="English", target_lang="Vietnamese"):
+    """
+    Translates a Word document passed as a Streamlit UploadedFile and returns a (BytesIO, file_name) tuple.
+    """
+    doc = Document(uploaded_file)
     doc = merge_elements(doc)
     print('Translating paragraphs.')
     translate_header_footer(doc, source_lang, target_lang)
     print('Done translating headers & footers.')
     output_stream = BytesIO()
     doc.save(output_stream)
     output_stream.seek(0)
+    return output_stream, file_name