Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

mintlee committed on Mar 25

Commit

e53f591

1 Parent(s): 739e7dc

update word

Browse files

Files changed (5) hide show

db/mongodb.py +1 -1
excel/excel_translate.py +1 -2
pages/upload.py +3 -3
test.ipynb +124 -46
word/word_translate.py +138 -180

db/mongodb.py CHANGED Viewed

@@ -13,7 +13,7 @@ def connect_mongodb(db_name, collection_name):
-def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file", file_tail=".pptx"):
     """
     Lưu file vào MongoDB bằng GridFS mà không kiểm tra trùng lặp.

+def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file"):
     """
     Lưu file vào MongoDB bằng GridFS mà không kiểm tra trùng lặp.

excel/excel_translate.py CHANGED Viewed

@@ -94,7 +94,7 @@ def read_csv_with_auto_encoding(csv_path):
     return df
-def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
     db = client[db_name]
@@ -143,7 +143,6 @@ def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", ch
         # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
-            source_lang=source_lang,
             target_lang=target_lang,
             gemini_api=gemini_api
         )

     return df
+def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
     db = client[db_name]
         # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
             target_lang=target_lang,
             gemini_api=gemini_api
         )

pages/upload.py CHANGED Viewed

@@ -7,7 +7,7 @@ from powerpoint.xml_handling import (
 from translate.translator import translate_text_dict
 from powerpoint.pptx_object import create_translated_ppt
 from excel.excel_translate import translate_xlsx, translate_csv
-from word.word_translate import translate_docx
 import dotenv
 import os
@@ -18,7 +18,7 @@ dotenv.load_dotenv(".env")
 # Cấu hình API key
 api_key = os.getenv("GEMINI_API_KEY")
 genai.configure(api_key=api_key)
-model = genai.GenerativeModel("gemini-1.5-flash")
 # Giao diện Streamlit
 st.title("Please chose your PPTX, Excel file to translate")
@@ -116,7 +116,7 @@ if uploaded_file is not None:
             file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file")
             st.write(f"File ID: {file_id}")
-            final_id = translate_docx(file_id=file_id, source_lang="en", target_lang="vi")
             st.write(f"Final CSV ID: {final_id}")
             if final_id:
                 st.write("✅ File đã sẵn sàng để tải xuống!")

 from translate.translator import translate_text_dict
 from powerpoint.pptx_object import create_translated_ppt
 from excel.excel_translate import translate_xlsx, translate_csv
+from word.word_translate import translate_docx_from_mongodb
 import dotenv
 import os
 # Cấu hình API key
 api_key = os.getenv("GEMINI_API_KEY")
 genai.configure(api_key=api_key)
+model = genai.GenerativeModel("gemini-2.0-flash")
 # Giao diện Streamlit
 st.title("Please chose your PPTX, Excel file to translate")
             file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file")
             st.write(f"File ID: {file_id}")
+            final_id = translate_docx_from_mongodb(file_id = file_id, target_lang="Vietnamese")
             st.write(f"Final CSV ID: {final_id}")
             if final_id:
                 st.write("✅ File đã sẵn sàng để tải xuống!")

test.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -23,9 +23,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
     "from pymongo import MongoClient\n",
     "\n",
@@ -79,7 +87,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -92,7 +100,7 @@
     "    :param collection_name: Tên collection GridFS\n",
     "    \"\"\"\n",
     "    # Kết nối đến MongoDB\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "    db = client[db_name]  # Database của bạn\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)  # Collection để lưu file\n",
     "\n",
@@ -124,7 +132,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -137,7 +145,7 @@
     "    \"\"\"\n",
     "    try:\n",
     "        # Kết nối MongoDB\n",
-    "        client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "        db = client[db_name]\n",
     "\n",
     "        # Khởi tạo GridFS với collection được chỉ định\n",
@@ -178,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -191,7 +199,7 @@
     "    :param collection_name: Tên collection GridFS\n",
     "    \"\"\"\n",
     "    # Kết nối đến MongoDB\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
@@ -219,40 +227,56 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def download_pptx_from_mongodb(file_id, save_path, save_name, db_name=\"ppt\", collection_name=\"final_xml\"):\n",
-    "    \"\"\"\n",
-    "    Tải file PowerPoint từ MongoDB GridFS và lưu về máy.\n",
-    "    \n",
-    "    :param file_id:       ID của file cần tải (dạng chuỗi hoặc ObjectId)\n",
-    "    :param save_path:     Đường dẫn đến thư mục sẽ lưu file (VD: 'D:/output')\n",
-    "    :param save_name:     Tên file khi lưu (VD: 'my_presentation.pptx')\n",
-    "    :param db_name:       Tên database trong MongoDB (mặc định: 'ppt')\n",
-    "    :param collection_name: Tên collection GridFS (mặc định: 'root_file')\n",
-    "    \"\"\"\n",
-    "    # Đảm bảo thư mục lưu file tồn tại\n",
-    "    os.makedirs(save_path, exist_ok=True)\n",
     "\n",
-    "    # Tạo đường dẫn đầy đủ cho file\n",
-    "    full_file_path = os.path.join(save_path, save_name)\n",
     "\n",
-    "    # Kết nối đến MongoDB\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
     "    try:\n",
-    "        # Chuyển đổi ID nếu cần\n",
     "        if not isinstance(file_id, ObjectId):\n",
     "            file_id = ObjectId(file_id)\n",
     "\n",
-    "        # Lấy dữ liệu file từ GridFS\n",
     "        file_data = fs.get(file_id)\n",
     "        \n",
-    "        # Ghi dữ liệu ra file\n",
     "        with open(full_file_path, \"wb\") as f:\n",
     "            f.write(file_data.read())\n",
     "\n",
@@ -265,7 +289,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -278,7 +336,7 @@
     "    :param db_name: Tên database MongoDB\n",
     "    :param collection_name: Tên collection GridFS\n",
     "    \"\"\"\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
@@ -342,7 +400,7 @@
     "    :return: ID của file XML trong MongoDB (original_xml)\n",
     "    \"\"\"\n",
     "    # Kết nối MongoDB\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "    db = client[db_name]\n",
     "\n",
     "    fs_ppt = gridfs.GridFS(db, collection=\"original_pptx\")  # PPT gốc\n",
@@ -391,7 +449,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -406,7 +464,7 @@
     "    :return: Dictionary {slide_number: [text1, text2, ...]}\n",
     "    \"\"\"\n",
     "    # Kết nối MongoDB\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
@@ -472,7 +530,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -485,7 +543,7 @@
     "    :param db_name: Tên database MongoDB\n",
     "    \"\"\"\n",
     "    # Kết nối MongoDB\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
     "    db = client[db_name]\n",
     "    \n",
     "    fs_original = gridfs.GridFS(db, collection=\"original_xml\")  # Lấy file từ original_xml\n",
@@ -644,10 +702,10 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "✅ Đã xóa 6 file trong collection 'root_file'\n",
       "✅ Đã xóa 1 file trong collection 'final_pptx'\n",
-      "✅ Đã xóa 0 file trong collection 'original_xml'\n",
-      "✅ Đã xóa 0 file trong collection 'final_xml'\n"
      ]
     }
    ],
@@ -656,6 +714,25 @@
     "    delete_all_files_in_collection(i)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 19,
@@ -675,13 +752,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "def file_list(collection=\"root_file\"):\n",
-    "    client = MongoClient(\"mongodb://localhost:27017/\")\n",
-    "    db = client[\"csv\"]\n",
     "    fs = gridfs.GridFS(db, collection=collection)\n",
     "    for file in fs.find():\n",
     "        print(f\"📂 File: {file.filename} - ID: {file._id}\")"
@@ -696,16 +773,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "📂 File: test1.xlsx - ID: 67d849b4ef2fcc7f191324f9\n",
-      "📂 File: test3.csv - ID: 67d864962cda0e8d5dd832d5\n",
-      "📂 File: test1.csv - ID: 67d8651a71e13e1efa8d56db\n"
      ]
     }
    ],

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Kết nối thành công!\n"
+     ]
+    }
+   ],
    "source": [
     "from pymongo import MongoClient\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "    :param collection_name: Tên collection GridFS\n",
     "    \"\"\"\n",
     "    # Kết nối đến MongoDB\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "    db = client[db_name]  # Database của bạn\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)  # Collection để lưu file\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
     "    \"\"\"\n",
     "    try:\n",
     "        # Kết nối MongoDB\n",
+    "        client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "        db = client[db_name]\n",
     "\n",
     "        # Khởi tạo GridFS với collection được chỉ định\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "    :param collection_name: Tên collection GridFS\n",
     "    \"\"\"\n",
     "    # Kết nối đến MongoDB\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
+    "def download_input_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n",
+    "    os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\input\", exist_ok=True)\n",
     "\n",
+    "    full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\input\", save_name)\n",
     "\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
+    "    db = client[db_name]\n",
+    "    fs = gridfs.GridFS(db, collection=collection_name)\n",
+    "\n",
+    "    try:\n",
+    "        if not isinstance(file_id, ObjectId):\n",
+    "            file_id = ObjectId(file_id)\n",
+    "\n",
+    "        file_data = fs.get(file_id)\n",
+    "        \n",
+    "        with open(full_file_path, \"wb\") as f:\n",
+    "            f.write(file_data.read())\n",
+    "\n",
+    "        print(f\"✅ File đã được tải về: {full_file_path}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"❌ Lỗi khi tải file: {e}\")\n",
+    "    finally:\n",
+    "        client.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_output_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n",
+    "    os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\output\", exist_ok=True)\n",
+    "\n",
+    "    full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\output\", save_name)\n",
+    "\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
     "    try:\n",
     "        if not isinstance(file_id, ObjectId):\n",
     "            file_id = ObjectId(file_id)\n",
     "\n",
     "        file_data = fs.get(file_id)\n",
     "        \n",
     "        with open(full_file_path, \"wb\") as f:\n",
     "            f.write(file_data.read())\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\chuong 8 NHTM.pptx\n"
+     ]
+    }
+   ],
+   "source": [
+    "download_input_from_mongodb(file_id=\"67dd7148972b1aa4dc9fb83d\", save_name=\"chuong 8 NHTM.pptx\", db_name=\"ppt\", collection_name=\"root_file\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\chuong 8 NHTM.pptx\n"
+     ]
+    }
+   ],
+   "source": [
+    "download_output_from_mongodb(file_id=\"67dd717f972b1aa4dc9fb84f\", save_name=\"chuong 8 NHTM.pptx\", db_name=\"ppt\", collection_name=\"final_pptx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "    :param db_name: Tên database MongoDB\n",
     "    :param collection_name: Tên collection GridFS\n",
     "    \"\"\"\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
     "    :return: ID của file XML trong MongoDB (original_xml)\n",
     "    \"\"\"\n",
     "    # Kết nối MongoDB\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "    db = client[db_name]\n",
     "\n",
     "    fs_ppt = gridfs.GridFS(db, collection=\"original_pptx\")  # PPT gốc\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "    :return: Dictionary {slide_number: [text1, text2, ...]}\n",
     "    \"\"\"\n",
     "    # Kết nối MongoDB\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "    db = client[db_name]\n",
     "    fs = gridfs.GridFS(db, collection=collection_name)\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "    :param db_name: Tên database MongoDB\n",
     "    \"\"\"\n",
     "    # Kết nối MongoDB\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
     "    db = client[db_name]\n",
     "    \n",
     "    fs_original = gridfs.GridFS(db, collection=\"original_xml\")  # Lấy file từ original_xml\n",
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "✅ Đã xóa 4 file trong collection 'root_file'\n",
       "✅ Đã xóa 1 file trong collection 'final_pptx'\n",
+      "✅ Đã xóa 1 file trong collection 'original_xml'\n",
+      "✅ Đã xóa 1 file trong collection 'final_xml'\n"
      ]
     }
    ],
     "    delete_all_files_in_collection(i)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Đã xóa 5 file trong collection 'root_file'\n",
+      "✅ Đã xóa 2 file trong collection 'final_file'\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in ['root_file', 'final_file']:\n",
+    "    delete_all_files_in_collection(i, db_name=\"excel\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 19,
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
     "def file_list(collection=\"root_file\"):\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
+    "    db = client[\"ppt\"]\n",
     "    fs = gridfs.GridFS(db, collection=collection)\n",
     "    for file in fs.find():\n",
     "        print(f\"📂 File: {file.filename} - ID: {file._id}\")"
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcd8c575cfef63155d3f91\n",
+      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcea4f02257ad0cb04610e\n",
+      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcead0143da29a5c6321ab\n",
+      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dd3bf23cf7ee2f6eca902e\n"
      ]
     }
    ],

word/word_translate.py CHANGED Viewed

@@ -1,60 +1,97 @@
-import os
 import docx
 from docx import Document
 import google.generativeai as genai
 import ast
 import json
-from docx.oxml import OxmlElement
-from copy import deepcopy
 import io
 from pymongo import MongoClient
 from gridfs import GridFS
 from docx import Document
-from deep_translator import GoogleTranslator
-gemini_api = "AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg"
-target_language = 'vi'
-source_language = 'en'
-def batch_translate(texts, source_lang = 'en', target_lang="fr"):
     """ Translates multiple text segments in a single API call. """
     if not texts:
         return texts  # Skip if empty
-    prompt = f"""
-            Translate the following JSON file from {source_lang} into {target_lang} while preserving names, links, symbols, and formatting:
-            {json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])}
             - The original JSON file contains a Python array of objects, each with "index" and "text" keys.
             - Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
             - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
-            - Return only valid JSON — a Python array of translated objects.
-            - If the original array is empty, return an empty array.
             """
-    client = genai.Client(api_key=gemini_api)
-    response = client.models.generate_content(
-    model="gemini-2.0-flash", contents=prompt)
-    translated_output = ast.literal_eval(response.text.strip().strip("json```").strip("```").strip())
-    return [item["text"] for item in translated_output]
 def merge_runs(runs):
     """ Merges adjacent runs with the same style. """
     merged_runs = []
     for run in runs:
-        if merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run):
-            if (
-            merged_runs and
             run.style == merged_runs[-1].style and
             merged_runs[-1].bold == run.bold and
             merged_runs[-1].italic == run.italic and
             merged_runs[-1].underline == run.underline and
             merged_runs[-1].font.size == run.font.size and
             merged_runs[-1].font.color.rgb == run.font.color.rgb and
-            merged_runs[-1].font.name == run.font.name
-):
                 merged_runs[-1].text += run.text
         else:
                 merged_runs.append(run)
@@ -62,146 +99,7 @@ def merge_runs(runs):
 NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
-def translate_paragraphs(doc, source_lang, target_lang):
-    paragraphs = []
-    for para in doc.paragraphs:
-        for run in merge_runs(para.iter_inner_content()):
-            if isinstance(run, docx.text.run.Run):
-                paragraphs.append(run.text)
-    # paragraphs = merge_runs(paragraphs)
-    translated_paragraphs = []
-    temp_batch = []
-    words = 0
-    for para in paragraphs:
-        if len(para) + words > 5000:
-            translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
-            temp_batch = []
-            words = 0
-        words += len(para)
-        temp_batch.append(para)
-    translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
-    # translated_paragraphs = batch_translate(paragraphs, target_lang)
-    if len(translated_paragraphs) > 0:
-        # Replace translated text back
-        para_index = 0
-        for para in doc.paragraphs:
-            original_para = deepcopy(para)
-            para.clear()  # Remove text while keeping paragraph properties
-            for run in merge_runs(original_para.iter_inner_content()):
-                if isinstance(run, docx.text.run.Run):
-                    translated_text = translated_paragraphs[para_index]
-                    try:
-                        translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # Ignore invalid characters
-                    except UnicodeEncodeError:
-                        translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # Replace invalid characters
-                    drawing = run._element.find(f".//{NS_W}drawing")
-                    pict = run._element.find(".//{NS_W}pict")
-                    # Create a new run with translated text and copy the formatting
-                    new_run = para.add_run(translated_text)
-                    new_run.style = run.style
-                    if drawing is not None:
-                            new_run._element.append(drawing)
-                    elif pict is not None:
-                        new_run._element.append(pict)
-                    # Copy formatting from original run
-                    new_run.bold = run.bold
-                    new_run.italic = run.italic
-                    new_run.underline = run.underline
-                    new_run.font.size = run.font.size
-                    new_run.font.color.rgb = run.font.color.rgb
-                    new_run.font.name = run.font.name
-                    para_index += 1
-                elif isinstance(run, docx.text.hyperlink.Hyperlink):
-                    parent = run._element
-                    tag = parent.tag.split("}")[-1]
-                    # Create a new hyperlink element with the correct namespace
-                    new_hyperlink = OxmlElement(f"w:{tag}")
-                    for attr in parent.attrib:
-                        new_hyperlink.set(attr, parent.get(attr))
-                    for child in parent:
-                        new_hyperlink.append(child)
-                    para._element.append(new_hyperlink)
-def translate_tables(doc, source_lang, target_lang):
-    table_texts = []
-    run_mapping = {}
-    for table in doc.tables:
-        for row in table.rows:
-            for cell in row.cells:
-                for para in cell.paragraphs:
-                    for run in merge_runs(para.iter_inner_content()):
-                        if isinstance(run, docx.text.run.Run):
-                            table_texts.append(run.text)
-    translated_tables = []
-    temp_batch = []
-    words = 0
-    for para in table_texts:
-        if len(para) + words > 5000:
-            translated_tables += batch_translate(temp_batch, source_lang, target_lang)
-            temp_batch = []
-            words = 0
-        words += len(para)
-        temp_batch.append(para)
-    translated_tables += batch_translate(temp_batch, source_lang, target_lang)
-    # translated_tables = batch_translate(table_texts, target_lang)
-    if len(translated_tables) > 0:
-        table_index = 0
-        for table in doc.tables:
-            for row in table.rows:
-                for cell in row.cells:
-                    for para in cell.paragraphs:
-                        original_para = deepcopy(para)
-                        para.clear()  # Remove text while keeping paragraph properties
-                        for run in merge_runs(original_para.iter_inner_content()):
-                            if isinstance(run, docx.text.run.Run):
-                                translated_text = translated_tables[table_index]
-                                try:
-                                    translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # Ignore invalid characters
-                                except UnicodeEncodeError:
-                                    translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # Replace invalid characters
-                                drawing = run._element.find(f".//{NS_W}drawing")
-                                pict = run._element.find(".//{NS_W}pict")
-                                # Create a new run with translated text and copy the formatting
-                                new_run = para.add_run(translated_text)
-                                new_run.style = run.style
-                                if drawing is not None:
-                                        new_run._element.append(drawing)
-                                elif pict is not None:
-                                    new_run._element.append(pict)
-                                # Copy formatting from original run
-                                new_run.bold = run.bold
-                                new_run.italic = run.italic
-                                new_run.underline = run.underline
-                                new_run.font.size = run.font.size
-                                new_run.font.color.rgb = run.font.color.rgb
-                                new_run.font.name = run.font.name
-                                table_index += 1
-                            elif isinstance(run, docx.text.hyperlink.Hyperlink):
-                                parent = run._element
-                                tag = parent.tag.split("}")[-1]
-                                # Create a new hyperlink element with the correct namespace
-                                new_hyperlink = OxmlElement(f"w:{tag}")
-                                for attr in parent.attrib:
-                                    new_hyperlink.set(attr, parent.get(attr))
-                                for child in parent:
-                                    new_hyperlink.append(child)
-                                para._element.append(new_hyperlink)
-def translate_header_footer(doc, source_lang, target_lang):
     head_foot = []
     for section in doc.sections:
         for header in section.header.paragraphs:
@@ -210,7 +108,7 @@ def translate_header_footer(doc, source_lang, target_lang):
         for footer in section.footer.paragraphs:
             for run in footer.runs:
                 head_foot.append(run.text)
-    translated_head_foot = batch_translate(head_foot, source_lang, target_lang)
     i = 0
     for section in doc.sections:
@@ -222,25 +120,85 @@ def translate_header_footer(doc, source_lang, target_lang):
             for run in footer.runs:
                 run.text = translated_head_foot[i]
                 i += 1
-def translate_docx(file_id, source_lang='en', target_lang='fr', db_name='word'):
-    client = MongoClient('mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0')
-    db = client[db_name]
     fs_input = GridFS(db, collection="root_file")
     fs_output = GridFS(db, collection="final_file")
     file_data = fs_input.get(file_id).read()
-    input_doc = Document(io.BytesIO(file_data))
-    translate_paragraphs(input_doc, source_lang, target_lang)
-    translate_tables(input_doc, source_lang, target_lang)
-    translate_header_footer(input_doc, source_lang, target_lang)
     output_stream = io.BytesIO()
-    input_doc.save(output_stream)
     output_stream.seek(0)
-    translated_file_id = fs_output.put(output_stream, filename=f"{target_lang}_translated.docx")
-    print(f"Translation complete! Saved with file ID: {translated_file_id}")
-    return translated_file_id

 import docx
 from docx import Document
 import google.generativeai as genai
 import ast
 import json
+import re
+import dotenv
+import os
 import io
 from pymongo import MongoClient
 from gridfs import GridFS
 from docx import Document
+dotenv.load_dotenv(".env")
+api_key = os.getenv("GEMINI_API_KEY")
+genai.configure(api_key=api_key)
+model = genai.GenerativeModel("gemini-2.0-flash")
def batch_translate(texts, target_lang="Vietnamese"):
    """Translate multiple text segments in a single Gemini API call.

    Args:
        texts: List of strings to translate. Each string is sent as one
            ``{"index": i, "text": ...}`` object so the reply can be matched
            back to its position.
        target_lang: Name of the language to translate into.

    Returns:
        ``texts`` itself when it is empty/falsy; otherwise a list with exactly
        ``len(texts)`` strings. Positions the model failed to return keep
        their original (untranslated) text so positional write-back by the
        callers never goes out of range.

    Raises:
        ValueError: If the model reply cannot be parsed as a list.
    """
    if not texts:
        return texts  # Skip if empty
    system_prompt = """ You are given three inputs: source language, target language and a json file.
            - Your task is to translate the JSON file from the source language (you have to detect the source language yourself) to the target language.
            - The original JSON file contains a Python array of objects, each with "index" and "text" keys.
            - Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
            - The array contains text that makes up whole paragraphs. Make sure that the translation makes sense when the text is put together and retains the same context.
            - This is very important: Empty spaces should be left as is. For example: From English, "Hello " should be translated into Vietnamese as "Xin chào ", with the same space at the end.
            - Very frequently there are spaces before or after a string. Do not remove these spaces.
            - If the source language is English and the target language is Vietnamese and a string contains "'s" in the possessive sense, translate it as "của".
            - Example: [["WorldQuant's"], ["Mission"]] should be translated as [["Nhiệm vụ"], ["của WorldQuant"]]
            - Words might be split into multiple continuous arrays. Translate them such that the translation corresponds to the full word.
            - If a word is split up into multiple arrays, the translation should be such that the word is not split up.
            - Exampe: ['Tesla sells its pro', 'ducts'] should be translated as ['Tesla bán sản phẩm của mình', ''.]. Note that the number of elements in the output is the same as the input.
            - Example: [["Hello"], ["World"]] should be translated as [["Xin chào"], ["Thế giới"]]
            - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
            - Return a JSON object that is a Python array.
            - Each object in the array is a dictionary with two keys: "index" and "text".
            - The text should be the translated version of the text in the original object, and the index should stay consistent.
            - The number of objects in the output MUST the same as the number of objects in the input.
            - The format of the output should look exactly like the example.
            - Example:
            **Input**: Target language: Vietnamese. JSON file:
            [{"index": 0, "text": ["My name is "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 3, "text": ["Today is "]}, {"index": 4, "text": ["a"]}, {"index": 5, "text": ["good day"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]'
            **Output**: [{"index": 0, "text": ["Tên tôi là "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 3, "text": ["Hôm nay là "]}, {"index": 4, "text": ["Một"]}, {"index": 5, "text": ["ngày đẹp"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]
            - Return the result of translation according to the format. Do NOT return code for translating.
            """
    json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"
    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 1,  # Adjust temperature for desired creativity
            'top_p': 1,
            'top_k': 1,
        },
    )
    return _align_translations(response.text, texts)


def _align_translations(raw_reply, originals):
    """Parse the model reply and return one string per entry of *originals*."""
    # Keep only the outermost [...] so Markdown fences or stray prose around
    # the array do not break parsing.
    start = raw_reply.find("[")
    end = raw_reply.rfind("]")
    payload = raw_reply[start:end + 1] if 0 <= start < end else raw_reply.strip()

    try:
        items = json.loads(payload)
    except json.JSONDecodeError:
        # The reply is sometimes a Python-style literal (single quotes).
        try:
            items = ast.literal_eval(payload)
        except (ValueError, SyntaxError) as err:
            raise ValueError("Could not parse the translation response as a list") from err
    if not isinstance(items, list):
        raise ValueError("Translation response is not a list")

    by_index = {}
    for item in items:
        if not isinstance(item, dict):
            continue
        value = item.get("text")
        if isinstance(value, list):
            # The prompt example wraps each text in a one-element list.
            value = value[0] if value else ""
        position = item.get("index")
        if isinstance(value, str) and isinstance(position, int):
            by_index[position] = value

    # Missing or malformed positions fall back to the source text.
    return [by_index.get(i, original) for i, original in enumerate(originals)]
def full_translate(texts, target_lang="Vietnamese", max_words=1000):
    """Translate *texts* in word-count-bounded batches, preserving order.

    Strings are accumulated until adding the next one would bring the batch
    to ``max_words`` whitespace-separated words or more; the pending batch is
    then sent to ``batch_translate`` and a new batch is started.

    Args:
        texts: Iterable of strings to translate.
        target_lang: Name of the language to translate into.
        max_words: Word budget per request (default 1000).

    Returns:
        A list with the concatenated results of every batch, in input order.
    """
    full_translated_texts = []
    batch = []
    word_count = 0
    for string in texts:
        n_words = len(string.split())
        # Only flush when something is pending: a single over-long string
        # arriving first must not trigger an empty request.
        if batch and word_count + n_words >= max_words:
            print('Translating a batch.')
            full_translated_texts += batch_translate(batch, target_lang)
            batch = []
            word_count = 0
        batch.append(string)
        word_count += n_words
    if batch:
        full_translated_texts += batch_translate(batch, target_lang)
    return full_translated_texts
 def merge_runs(runs):
     """ Merges adjacent runs with the same style. """
     merged_runs = []
     for run in runs:
+        if (merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run) and
             run.style == merged_runs[-1].style and
             merged_runs[-1].bold == run.bold and
             merged_runs[-1].italic == run.italic and
             merged_runs[-1].underline == run.underline and
             merged_runs[-1].font.size == run.font.size and
             merged_runs[-1].font.color.rgb == run.font.color.rgb and
+            merged_runs[-1].font.name == run.font.name):
                 merged_runs[-1].text += run.text
         else:
                 merged_runs.append(run)
 NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+def translate_header_footer(doc, target_lang):
     head_foot = []
     for section in doc.sections:
         for header in section.header.paragraphs:
         for footer in section.footer.paragraphs:
             for run in footer.runs:
                 head_foot.append(run.text)
+    translated_head_foot = full_translate(head_foot, target_lang)
     i = 0
     for section in doc.sections:
             for run in footer.runs:
                 run.text = translated_head_foot[i]
                 i += 1
# One character outside the Basic Multilingual Plane (the range the original
# code treats as emoji). The capture group makes ``split`` keep each match, so
# the result alternates: text, emoji, text, emoji, ..., text.
_ASTRAL_CHAR_RE = re.compile(r'([\U00010000-\U0010FFFF])')


def _is_text_element(element):
    """Return True when the element's local tag name is exactly ``t``."""
    tag = element.tag
    # A plain ``endswith('t')`` also matches names such as ``instrText`` or
    # ``posOffset``, whose content must never be translated. Non-string tags
    # (comment / processing-instruction nodes) are skipped as well.
    return isinstance(tag, str) and tag.rpartition('}')[2] == 't'


def get_text_elements_para(doc):
    """Collect the translatable text segments of every paragraph in *doc*.

    Each text element's content is split around astral-plane characters;
    only the segments between them are returned (possibly empty strings),
    in document order.

    Args:
        doc: Any object exposing ``paragraphs`` (a document or a table cell).

    Returns:
        List of text segments, in the order ``translate_paragraphs`` consumes them.
    """
    para_texts = []
    for para in doc.paragraphs:
        for element in para._element.iter():
            if _is_text_element(element) and element.text:
                # Even positions are text; odd positions are the kept emoji.
                para_texts.extend(_ASTRAL_CHAR_RE.split(element.text)[::2])
    return para_texts


def get_text_elements_table(doc):
    """Collect translatable text segments from every cell of every table in *doc*."""
    table_texts = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                table_texts += get_text_elements_para(cell)
    return table_texts


def translate_paragraphs(doc, translated_texts, i = 0):
    """Write translated segments back into the paragraphs of *doc*.

    Walks the same elements, in the same order, as ``get_text_elements_para``
    and replaces each text segment with ``translated_texts[i]``, leaving the
    astral-plane characters in place.

    Args:
        doc: Any object exposing ``paragraphs`` (a document or a table cell).
        translated_texts: Translations aligned with the extracted segments.
        i: Index of the first translation to consume.

    Returns:
        Tuple ``(doc, i)`` where ``i`` is the index of the next unused translation.

    Raises:
        IndexError: If ``translated_texts`` has fewer entries than segments.
    """
    for para in doc.paragraphs:
        for element in para._element.iter():
            if not (_is_text_element(element) and element.text):
                continue
            parts = _ASTRAL_CHAR_RE.split(element.text)
            for j in range(0, len(parts), 2):
                parts[j] = translated_texts[i]
                i += 1
            element.text = "".join(parts)
    return doc, i


def translate_tables(doc, translated_texts):
    """Write translated segments back into every table cell of *doc*.

    Cells are visited in the same order as ``get_text_elements_table`` and a
    single running index is threaded through all of them.

    Returns:
        The same *doc* object, modified in place.
    """
    i = 0
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _, i = translate_paragraphs(cell, translated_texts, i)
    return doc
def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
    """Translate a .docx stored in MongoDB GridFS and store the result.

    The source file is read from the ``root_file`` GridFS collection of the
    ``word`` database, its body paragraphs, tables and headers/footers are
    translated, and the result is written to the ``final_file`` collection
    under the original filename.

    Args:
        file_id: GridFS id of the source document.
        target_lang: Name of the language to translate into.

    Returns:
        The GridFS id of the stored translated document.
    """
    # Connect to MongoDB.
    # NOTE(review): the connection string embeds credentials; consider moving
    # it to configuration/environment instead of source code.
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    try:
        db = client["word"]
        fs_input = GridFS(db, collection="root_file")
        fs_output = GridFS(db, collection="final_file")

        # Fetch the file once; the same handle gives both bytes and name.
        source_file = fs_input.get(file_id)
        file_data = source_file.read()
        original_file = source_file.filename  # original name of the file
        doc = Document(io.BytesIO(file_data))

        # Extract everything first, then translate, so write-back indexes
        # line up with the untouched source text.
        para_texts = get_text_elements_para(doc)
        translated_para = full_translate(para_texts, target_lang)
        table_texts = get_text_elements_table(doc)
        translated_tables = full_translate(table_texts, target_lang)

        # Write the translated content back into the document.
        doc, _ = translate_paragraphs(doc, translated_para)
        doc = translate_tables(doc, translated_tables)
        translate_header_footer(doc, target_lang)

        # Save the translated file to MongoDB under the original name.
        output_stream = io.BytesIO()
        doc.save(output_stream)
        output_stream.seek(0)
        translated_file_id = fs_output.put(output_stream, filename=original_file)
        return translated_file_id
    finally:
        # Close the connection even when translation or storage fails.
        client.close()