{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Được thôi, đây là một bài thơ haiku về lập trình Python:\n",
      "\n",
      "Cú pháp thật mềm,\n",
      "Code chạy nhanh, không lỗi lầm.\n",
      "Vui khi được code.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import google.generativeai as genai\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()\n",
    "import os\n",
    "\n",
    "genai.configure(api_key=os.getenv(\"GEMINI_API_KEY\"))  # Thay thế bằng khóa API của bạn\n",
    "\n",
    "model = genai.GenerativeModel(\"gemini-2.0-flash-lite\")  # hoặc gemini-1.5-pro-latest\n",
    "\n",
    "response = model.generate_content(\"Viết một bài thơ haiku về lập trình Python.\")\n",
    "print(response.text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymongo import MongoClient\n",
    "import gridfs\n",
    "from bson import ObjectId\n",
    "import os\n",
    "from pptx import Presentation\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from xml.dom import minidom\n",
    "import xml.etree.ElementTree as ET\n",
    "# from pptx_object import get_table_properties, get_shape_properties\n",
    "from pptx.enum.shapes import MSO_SHAPE_TYPE\n",
    "from typing import Dict, List, Optional\n",
    "import json\n",
    "# from translator import translate_text_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Delete file in DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def delete_all_files_in_collection(collection_name, db_name=\"ppt\"):\n",
    "    try:\n",
    "        # Kết nối MongoDB\n",
    "        client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "        db = client[db_name]\n",
    "\n",
    "        # Khởi tạo GridFS với collection được chỉ định\n",
    "        fs = gridfs.GridFS(db, collection=collection_name)\n",
    "\n",
    "        # Lấy danh sách file_id của tất cả file trong GridFS\n",
    "        file_ids = [file[\"_id\"] for file in db[f\"{collection_name}.files\"].find({})]\n",
    "\n",
    "        # Xóa từng file trong GridFS\n",
    "        for file_id in file_ids:\n",
    "            fs.delete(file_id)\n",
    "\n",
    "        print(f\"✅ Đã xóa {len(file_ids)} file trong collection '{collection_name}' của db '{db_name}'\")\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Lỗi khi xóa file: {str(e)}\")\n",
    "\n",
    "    finally:\n",
    "        client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Đã xóa 3 file trong collection 'final_pptx'\n"
     ]
    }
   ],
   "source": [
    "delete_all_files_in_collection(\"final_pptx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def delete_pptx_from_mongodb(file_id, db_name=\"ppt\", collection_name=\"_xml\"):\n",
    "    \"\"\"\n",
    "    Xóa file PowerPoint khỏi MongoDB theo ID.\n",
    "\n",
    "    :param file_id: ID của file cần xóa (chuỗi hoặc ObjectId)\n",
    "    :param db_name: Tên database trong MongoDB\n",
    "    :param collection_name: Tên collection GridFS\n",
    "    \"\"\"\n",
    "    # Kết nối đến MongoDB\n",
    "    client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "    db = client[db_name]\n",
    "    fs = gridfs.GridFS(db, collection=collection_name)\n",
    "\n",
    "    try:\n",
    "        # Chuyển đổi ID nếu cần\n",
    "        if not isinstance(file_id, ObjectId):\n",
    "            file_id = ObjectId(file_id)\n",
    "\n",
    "        # Kiểm tra file có tồn tại không\n",
    "        if fs.exists(file_id):\n",
    "            fs.delete(file_id)\n",
    "            print(f\"✅ Đã xóa file với ID: {file_id}\")\n",
    "        else:\n",
    "            print(f\"⚠️ Không tìm thấy file với ID: {file_id}\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Lỗi khi xóa file: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download file from DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_input_from_mongodb(file_id, save_name, db_name=\"excel\", collection_name=\"root_file\"):\n",
    "    os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\input\", exist_ok=True)\n",
    "\n",
    "    full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\input\", save_name)\n",
    "\n",
    "    client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "    db = client[db_name]\n",
    "    fs = gridfs.GridFS(db, collection=collection_name)\n",
    "\n",
    "    try:\n",
    "        if not isinstance(file_id, ObjectId):\n",
    "            file_id = ObjectId(file_id)\n",
    "\n",
    "        file_data = fs.get(file_id)\n",
    "        \n",
    "        with open(full_file_path, \"wb\") as f:\n",
    "            f.write(file_data.read())\n",
    "\n",
    "        print(f\"✅ File đã được tải về: {full_file_path}\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Lỗi khi tải file: {e}\")\n",
    "    finally:\n",
    "        client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_output_from_mongodb(file_id, save_name, db_name=\"excel\", collection_name=\"root_file\"):\n",
    "    os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\output\", exist_ok=True)\n",
    "\n",
    "    full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\output\", save_name)\n",
    "\n",
    "    client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "    db = client[db_name]\n",
    "    fs = gridfs.GridFS(db, collection=collection_name)\n",
    "\n",
    "    try:\n",
    "        if not isinstance(file_id, ObjectId):\n",
    "            file_id = ObjectId(file_id)\n",
    "\n",
    "        file_data = fs.get(file_id)\n",
    "        \n",
    "        with open(full_file_path, \"wb\") as f:\n",
    "            f.write(file_data.read())\n",
    "\n",
    "        print(f\"✅ File đã được tải về: {full_file_path}\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Lỗi khi tải file: {e}\")\n",
    "    finally:\n",
    "        client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\szero-point-ieks-essays-1350537845-9781350537842_compress.docx\n"
     ]
    }
   ],
   "source": [
    "download_input_from_mongodb(file_id=\"6843696876015abc15cc759f\", save_name=\"szero-point-ieks-essays-1350537845-9781350537842_compress.docx\", db_name=\"word\", collection_name=\"root_file\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\samsung_presentation_vietnamese.pptx\n"
     ]
    }
   ],
   "source": [
    "download_output_from_mongodb(file_id=\"684194c376015abc15cc7428\", save_name=\"samsung_presentation_vietnamese.pptx\", db_name=\"pptx\", collection_name=\"final_file\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# List all file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Đã xóa 1 file trong collection 'root_file'\n",
      "✅ Đã xóa 0 file trong collection 'final_pptx'\n",
      "✅ Đã xóa 0 file trong collection 'original_xml'\n",
      "✅ Đã xóa 0 file trong collection 'final_xml'\n"
     ]
    }
   ],
   "source": [
    "for i in ['root_file', 'final_pptx', 'original_xml', 'final_xml']:\n",
    "\n",
    "    delete_all_files_in_collection(i, db_name = 'pptx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Đã xóa 0 file trong collection 'root_file' của db 'word'\n",
      "✅ Đã xóa 8 file trong collection 'root_file' của db 'excel'\n",
      "✅ Đã xóa 0 file trong collection 'root_file' của db 'pptx'\n",
      "✅ Đã xóa 0 file trong collection 'root_file' của db 'csv'\n",
      "✅ Đã xóa 0 file trong collection 'final_file' của db 'word'\n",
      "✅ Đã xóa 7 file trong collection 'final_file' của db 'excel'\n",
      "✅ Đã xóa 0 file trong collection 'final_file' của db 'pptx'\n",
      "✅ Đã xóa 0 file trong collection 'final_file' của db 'csv'\n"
     ]
    }
   ],
   "source": [
    "for i in ['root_file', 'final_file']:\n",
    "    for j in ['word', 'excel', 'pptx', 'csv']:\n",
    "        delete_all_files_in_collection(i, db_name=j)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "total_size = 0\n",
    "\n",
    "for db_name in ['word', 'exce', 'pptx', 'csv']:\n",
    "    db = client[db_name]\n",
    "    stats = db.command(\"dbstats\")\n",
    "    db_size = stats.get(\"StorageSize\", 0)\n",
    "    total_size += db_size\n",
    "total_size / (1024** 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Đã lấy 0 dữ liệu đo lường.\n",
      "⚠️ Không có dữ liệu đo lường.\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "from requests.auth import HTTPDigestAuth\n",
    "import datetime\n",
    "\n",
    "# ==== Cấu hình ====\n",
    "PUBLIC_KEY = 'uetgyqkj'\n",
    "PRIVATE_KEY = '892caec5-8474-4043-862b-f4d4c617daa2'\n",
    "GROUP_ID = '67db8bf4ed971c2114aad7f1#'         # còn gọi là Project ID\n",
    "CLUSTER_NAME = 'Cluster0'\n",
    "\n",
    "# ==== Lấy metric dung lượng dữ liệu ====\n",
    "url = f\"https://cloud.mongodb.com/api/atlas/v1.0/groups/{GROUP_ID}/clusters/{CLUSTER_NAME}/measurements\"\n",
    "\n",
    "params = {\n",
    "    \"granularity\": \"PT1M\",           # lấy theo từng phút\n",
    "    \"period\": \"PT1H\",                # 5 phút gần nhất\n",
    "    \"m\": \"DATA_SIZE_TOTAL\",          # metric cần lấy\n",
    "}\n",
    "\n",
    "response = requests.get(\n",
    "    url,\n",
    "    auth=HTTPDigestAuth(PUBLIC_KEY, PRIVATE_KEY),\n",
    "    params=params\n",
    ")\n",
    "\n",
    "# ==== Xử lý kết quả ====\n",
    "if response.status_code == 200:\n",
    "    data = response.json()\n",
    "    measurements = data.get(\"measurements\", [])\n",
    "    print(f\"✅ Đã lấy {len(measurements)} dữ liệu đo lường.\")\n",
    "    if measurements:\n",
    "        datapoints = measurements[0].get(\"dataPoints\", [])\n",
    "        if datapoints:\n",
    "            latest_point = [d for d in datapoints if d['value'] is not None][-1]\n",
    "            value_bytes = latest_point['value']\n",
    "            ts = latest_point['timestamp']\n",
    "            print(f\"✅ Dung lượng hiện tại: {value_bytes / (1024**2):.2f} MB (timestamp: {ts})\")\n",
    "        else:\n",
    "            print(\"⚠️ Không có datapoint nào.\")\n",
    "    else:\n",
    "        print(\"⚠️ Không có dữ liệu đo lường.\")\n",
    "else:\n",
    "    print(f\"❌ Lỗi {response.status_code}: {response.text}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2025-06-05 15:19:25'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import time \n",
    "now = time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n",
    "now"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "ename": "OperationFailure",
     "evalue": "(Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }, full error: {'ok': 0, 'errmsg': '(Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }', 'code': 8000, 'codeName': 'AtlasError'}",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mOperationFailure\u001b[0m                          Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[27], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtime\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madmin\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommand\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfsync\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# Force flush\u001b[39;00m\n\u001b[0;32m      4\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m1\u001b[39m)\n",
      "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\_csot.py:119\u001b[0m, in \u001b[0;36mapply.<locals>.csot_wrapper\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m    117\u001b[0m         \u001b[38;5;28;01mwith\u001b[39;00m _TimeoutContext(timeout):\n\u001b[0;32m    118\u001b[0m             \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m--> 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\database.py:930\u001b[0m, in \u001b[0;36mDatabase.command\u001b[1;34m(self, command, value, check, allowable_errors, read_preference, codec_options, session, comment, **kwargs)\u001b[0m\n\u001b[0;32m    925\u001b[0m     read_preference \u001b[38;5;241m=\u001b[39m (session \u001b[38;5;129;01mand\u001b[39;00m session\u001b[38;5;241m.\u001b[39m_txn_read_preference()) \u001b[38;5;129;01mor\u001b[39;00m ReadPreference\u001b[38;5;241m.\u001b[39mPRIMARY\n\u001b[0;32m    926\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39m_conn_for_reads(read_preference, session, operation\u001b[38;5;241m=\u001b[39mcommand_name) \u001b[38;5;28;01mas\u001b[39;00m (\n\u001b[0;32m    927\u001b[0m     connection,\n\u001b[0;32m    928\u001b[0m     read_preference,\n\u001b[0;32m    929\u001b[0m ):\n\u001b[1;32m--> 930\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_command(\n\u001b[0;32m    931\u001b[0m         connection,\n\u001b[0;32m    932\u001b[0m         command,\n\u001b[0;32m    933\u001b[0m         value,\n\u001b[0;32m    934\u001b[0m         check,\n\u001b[0;32m    935\u001b[0m         allowable_errors,\n\u001b[0;32m    936\u001b[0m         read_preference,\n\u001b[0;32m    937\u001b[0m         opts,  \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[0;32m    938\u001b[0m         session\u001b[38;5;241m=\u001b[39msession,\n\u001b[0;32m    939\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m    940\u001b[0m     )\n",
      "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\database.py:770\u001b[0m, in \u001b[0;36mDatabase._command\u001b[1;34m(self, conn, command, value, check, allowable_errors, read_preference, codec_options, write_concern, parse_write_concern_error, session, **kwargs)\u001b[0m\n\u001b[0;32m    768\u001b[0m command\u001b[38;5;241m.\u001b[39mupdate(kwargs)\n\u001b[0;32m    769\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39m_tmp_session(session) \u001b[38;5;28;01mas\u001b[39;00m s:\n\u001b[1;32m--> 770\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommand\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    771\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    772\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    773\u001b[0m \u001b[43m        \u001b[49m\u001b[43mread_preference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    774\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcodec_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    775\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    776\u001b[0m \u001b[43m        \u001b[49m\u001b[43mallowable_errors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    777\u001b[0m \u001b[43m        \u001b[49m\u001b[43mwrite_concern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwrite_concern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    778\u001b[0m \u001b[43m        \u001b[49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    779\u001b[0m \u001b[43m        
\u001b[49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    780\u001b[0m \u001b[43m        \u001b[49m\u001b[43mclient\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_client\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    781\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\helpers.py:47\u001b[0m, in \u001b[0;36m_handle_reauth.<locals>.inner\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m     44\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpymongo\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msynchronous\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpool\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Connection\n\u001b[0;32m     46\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 47\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m     48\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m OperationFailure \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m     49\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m no_reauth:\n",
      "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\pool.py:536\u001b[0m, in \u001b[0;36mConnection.command\u001b[1;34m(self, dbname, spec, read_preference, codec_options, check, allowable_errors, read_concern, write_concern, parse_write_concern_error, collation, session, client, retryable_write, publish_events, user_fields, exhaust_allowed)\u001b[0m\n\u001b[0;32m    534\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_if_not_writable(unacknowledged)\n\u001b[0;32m    535\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 536\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcommand\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    537\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m    538\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdbname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    539\u001b[0m \u001b[43m        \u001b[49m\u001b[43mspec\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    540\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mis_mongos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    541\u001b[0m \u001b[43m        \u001b[49m\u001b[43mread_preference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    542\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcodec_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    543\u001b[0m \u001b[43m        \u001b[49m\u001b[43msession\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    544\u001b[0m \u001b[43m        \u001b[49m\u001b[43mclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    545\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    546\u001b[0m \u001b[43m        \u001b[49m\u001b[43mallowable_errors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    547\u001b[0m \u001b[43m        
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maddress\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    548\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlisteners\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    549\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_bson_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    550\u001b[0m \u001b[43m        \u001b[49m\u001b[43mread_concern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    551\u001b[0m \u001b[43m        \u001b[49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    552\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcollation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcollation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    553\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcompression_ctx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompression_context\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    554\u001b[0m \u001b[43m        \u001b[49m\u001b[43muse_op_msg\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mop_msg_enabled\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    555\u001b[0m \u001b[43m        \u001b[49m\u001b[43munacknowledged\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43munacknowledged\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    556\u001b[0m \u001b[43m        \u001b[49m\u001b[43muser_fields\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_fields\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    557\u001b[0m \u001b[43m        \u001b[49m\u001b[43mexhaust_allowed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexhaust_allowed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    558\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mwrite_concern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwrite_concern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    559\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    560\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (OperationFailure, NotPrimaryError):\n\u001b[0;32m    561\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m\n",
      "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\network.py:213\u001b[0m, in \u001b[0;36mcommand\u001b[1;34m(conn, dbname, spec, is_mongos, read_preference, codec_options, session, client, check, allowable_errors, address, listeners, max_bson_size, read_concern, parse_write_concern_error, collation, compression_ctx, use_op_msg, unacknowledged, user_fields, exhaust_allowed, write_concern)\u001b[0m\n\u001b[0;32m    211\u001b[0m             client\u001b[38;5;241m.\u001b[39m_process_response(response_doc, session)\n\u001b[0;32m    212\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m check:\n\u001b[1;32m--> 213\u001b[0m             \u001b[43mhelpers_shared\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_command_response\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    214\u001b[0m \u001b[43m                \u001b[49m\u001b[43mresponse_doc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    215\u001b[0m \u001b[43m                \u001b[49m\u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_wire_version\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    216\u001b[0m \u001b[43m                \u001b[49m\u001b[43mallowable_errors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    217\u001b[0m \u001b[43m                \u001b[49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    218\u001b[0m \u001b[43m            \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    219\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m    220\u001b[0m     duration \u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mdatetime\u001b[38;5;241m.\u001b[39mnow() \u001b[38;5;241m-\u001b[39m start\n",
      "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\helpers_shared.py:247\u001b[0m, in \u001b[0;36m_check_command_response\u001b[1;34m(response, max_wire_version, allowable_errors, parse_write_concern_error)\u001b[0m\n\u001b[0;32m    244\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m43\u001b[39m:\n\u001b[0;32m    245\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m CursorNotFound(errmsg, code, response, max_wire_version)\n\u001b[1;32m--> 247\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m OperationFailure(errmsg, code, response, max_wire_version)\n",
      "\u001b[1;31mOperationFailure\u001b[0m: (Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }, full error: {'ok': 0, 'errmsg': '(Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }', 'code': 8000, 'codeName': 'AtlasError'}"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "client.admin.command(\"fsync\")  # Force flush\n",
    "time.sleep(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def file_list(collection=\"root_file\"):\n",
    "    client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "    db = client[\"ppt\"]\n",
    "    fs = gridfs.GridFS(db, collection=collection)\n",
    "    for file in fs.find():\n",
    "        print(f\"📂 File: {file.filename} - ID: {file._id}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcd8c575cfef63155d3f91\n",
      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcea4f02257ad0cb04610e\n",
      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcead0143da29a5c6321ab\n",
      "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dd3bf23cf7ee2f6eca902e\n"
     ]
    }
   ],
   "source": [
    "file_list(\"root_file\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Đã xóa file với ID: 67d383681d4db191e51b0bd8\n"
     ]
    }
   ],
   "source": [
    "delete_pptx_from_mongodb(file_id='67d383681d4db191e51b0bd8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from io import BytesIO\n",
    "from openpyxl import load_workbook\n",
    "from utils.utils import translate_single_text\n",
    "from pymongo import MongoClient\n",
    "from bson import ObjectId\n",
    "import gridfs\n",
    "from utils.utils import unzip_office_file\n",
    "\n",
    "\n",
    "# 1. Kết nối tới MongoDB\n",
    "client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "db = client['excel']\n",
    "fs = gridfs.GridFS(db, collection='root_file')\n",
    "\n",
    "# 2. Tải file Excel từ MongoDB\n",
    "file_obj = fs.get(ObjectId('6836c69e59530f034bd86576'))\n",
    "excel_file = BytesIO(file_obj.read())\n",
    "\n",
    "xml_folder = unzip_office_file(r\"D:\\Show_me_everything\\Machine Translation\\input\\sample.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "from typing import List\n",
    "from utils.utils import translate_single_text\n",
    "\n",
    "def _register_all_namespaces(xml_file_path):\n",
    "    try:\n",
    "        namespaces = dict([\n",
    "            node for _, node in ET.iterparse(xml_file_path, events=['start-ns'])\n",
    "        ])\n",
    "        for prefix, uri in namespaces.items():\n",
    "            ET.register_namespace(prefix, uri)\n",
    "        if '' in namespaces and namespaces[''] == \"http://schemas.openxmlformats.org/spreadsheetml/2006/main\":\n",
    "             ET.register_namespace('', \"http://schemas.openxmlformats.org/spreadsheetml/2006/main\")\n",
    "        if 'r' not in namespaces:\n",
    "             ET.register_namespace('r', \"http://schemas.openxmlformats.org/officeDocument/2006/relationships\")\n",
    "        if 'mc' not in namespaces:\n",
    "            ET.register_namespace('mc', \"http://schemas.openxmlformats.org/markup-compatibility/2006\")\n",
    "        if 'x15' not in namespaces:\n",
    "            ET.register_namespace('x15', \"http://schemas.microsoft.com/office/spreadsheetml/2010/11/main\")\n",
    "        return namespaces # Trả về để có thể dùng trong findall nếu cần\n",
    "    except Exception as e:\n",
    "        print(f\"Lỗi khi đăng ký namespace từ {xml_file_path}: {e}\")\n",
    "        return {}\n",
    "\n",
    "def get_all_sheet_names_from_file(workbook_xml_path, source_lang='chinese', target_lang='vietnamese') -> List[str]:\n",
    "    sheet_names = {}\n",
    "    try:\n",
    "        # Phân tích trực tiếp từ file\n",
    "        tree = ET.parse(workbook_xml_path)\n",
    "        root = tree.getroot()\n",
    "\n",
    "        namespaces = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}\n",
    "\n",
    "        for sheet_element in root.findall('.//main:sheet', namespaces):\n",
    "            name = sheet_element.get('name')\n",
    "            if name:\n",
    "                sheet_names[name] = name\n",
    "        for k in sheet_names:\n",
    "            sheet_names[k] = translate_single_text(k, source_lang, target_lang)\n",
    "\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Lỗi: Không tìm thấy file tại '{workbook_xml_path}'\")\n",
    "    except ET.ParseError as e:\n",
    "        print(f\"Lỗi khi phân tích XML từ file '{workbook_xml_path}': {e}\")\n",
    "    except Exception as e:\n",
    "        print(f\"Đã xảy ra lỗi không mong muốn khi xử lý file '{workbook_xml_path}': {e}\")\n",
    "\n",
    "    return sheet_names\n",
    "\n",
    "def translate_sheet_names_in_file(workbook_xml_path, source_lang = 'chinese', target_lang = 'vietnamese', delay_seconds = 5):\n",
    "    \"\"\"Translate every sheet name in a workbook.xml file and save the file in place.\n",
    "\n",
    "    Args:\n",
    "        workbook_xml_path: Path to the xl/workbook.xml file to rewrite.\n",
    "        source_lang: Source language passed to translate_single_text.\n",
    "        target_lang: Target language passed to translate_single_text.\n",
    "        delay_seconds: Pause after each successful translation call. Defaults\n",
    "            to 5, the value the previous os.wait(5) call was given; pass 0 or\n",
    "            None to disable the pause.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each original sheet name to the name now stored in the\n",
    "        XML. Names whose translation raised, was blank, or was identical to\n",
    "        the original map to themselves. Errors are printed, not raised.\n",
    "    \"\"\"\n",
    "    import time  # local import: this cell does not import time at the top\n",
    "\n",
    "    # Same cap that translate_sheet_names_via_regex applies ([:31]);\n",
    "    # presumably Excel's sheet-name length limit.\n",
    "    max_sheet_name_length = 31\n",
    "\n",
    "    original_to_translated_map: Dict[str, str] = {}\n",
    "    file_modified = False\n",
    "\n",
    "    try:\n",
    "        # Register the file's own prefixes so tree.write() reuses them rather\n",
    "        # than generating ns0/ns1 prefixes.\n",
    "        _register_all_namespaces(workbook_xml_path)\n",
    "        tree = ET.parse(workbook_xml_path)\n",
    "        root = tree.getroot()\n",
    "        search_namespaces = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}\n",
    "\n",
    "        # Translate each <sheet> name and update the attribute in the tree.\n",
    "        for sheet_element in root.findall('.//main:sheet', search_namespaces):\n",
    "            original_name = sheet_element.get('name')\n",
    "            if not original_name or not original_name.strip():\n",
    "                continue  # nothing to translate\n",
    "\n",
    "            try:\n",
    "                translated_name = translate_single_text(original_name, source_lang, target_lang)\n",
    "                # The previous os.wait(5) always raised (os.wait takes no\n",
    "                # arguments and is unavailable on Windows), which sent every\n",
    "                # sheet to the except branch below and left it untranslated.\n",
    "                if delay_seconds and delay_seconds > 0:\n",
    "                    time.sleep(delay_seconds)\n",
    "\n",
    "                if translated_name and translated_name.strip() and translated_name != original_name:\n",
    "                    translated_name = translated_name[:max_sheet_name_length]\n",
    "                    sheet_element.set('name', translated_name)\n",
    "                    original_to_translated_map[original_name] = translated_name\n",
    "                    file_modified = True\n",
    "                    print(f\"Đã dịch sheet: '{original_name}' -> '{translated_name}'\")\n",
    "                else:\n",
    "                    # Translation failed or changed nothing: keep the original name.\n",
    "                    original_to_translated_map[original_name] = original_name\n",
    "                    if translated_name and translated_name != original_name:\n",
    "                        print(f\"Bản dịch cho '{original_name}' trống hoặc không hợp lệ, không cập nhật XML.\")\n",
    "                    elif not translated_name:\n",
    "                        print(f\"Dịch thất bại cho '{original_name}', không cập nhật XML.\")\n",
    "\n",
    "            except Exception as e_translate:\n",
    "                print(f\"Lỗi khi dịch tên sheet '{original_name}': {e_translate}\")\n",
    "                original_to_translated_map[original_name] = original_name  # keep the original on error\n",
    "\n",
    "        # Write the tree back only when at least one name actually changed.\n",
    "        if file_modified:\n",
    "            tree.write(workbook_xml_path, encoding='utf-8', xml_declaration=True)\n",
    "            print(f\"Đã cập nhật thành công file: {workbook_xml_path}\")\n",
    "        else:\n",
    "            print(f\"Không có tên sheet nào được thay đổi trong file: {workbook_xml_path}\")\n",
    "\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Lỗi: Không tìm thấy file tại '{workbook_xml_path}'\")\n",
    "    except ET.ParseError as e:\n",
    "        print(f\"Lỗi khi phân tích XML từ file '{workbook_xml_path}': {e}\")\n",
    "    except Exception as e:\n",
    "        print(f\"Đã xảy ra lỗi không mong muốn khi xử lý file '{workbook_xml_path}': {e}\")\n",
    "        import traceback\n",
    "        traceback.print_exc()\n",
    "\n",
    "    return original_to_translated_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Đã dịch sheet: '流程图' -> 'Lưu đồ'\n",
      "Đã dịch sheet: '分割光源板' -> 'Tấm dẫn sáng phân đoạn'\n",
      "Đã dịch sheet: '支架外观检验' -> 'Kiểm tra ngoại quan giá đỡ'\n",
      "Đã dịch sheet: '固定驱动' -> 'Ổ đĩa cố định'\n",
      "Đã dịch sheet: '固定接地线' -> 'Dây nối đất cố định'\n",
      "Đã dịch sheet: '整理电源线' -> 'Sắp xếp dây nguồn'\n",
      "Đã dịch sheet: '固定光源板' -> 'Bảng nguồn sáng cố định'\n",
      "Đã dịch sheet: '连接光源板' -> 'Kết nối bảng nguồn sáng'\n",
      "Đã dịch sheet: '焊接光源板' -> 'Hàn bảng nguồn sáng'\n",
      "Đã dịch sheet: '连接驱动' -> 'Kết nối trình điều khiển'\n",
      "Đã dịch sheet: '安装端头 ' -> 'Lắp đặt đầu cuối'\n",
      "Đã dịch sheet: '试 亮' -> 'Thử Sáng'\n",
      "Đã dịch sheet: '绝缘、接地 ' -> 'Cách điện, tiếp đất'\n",
      "Đã dịch sheet: '安装透光罩 (2)' -> 'Lắp chụp đèn (2)'\n",
      "Đã dịch sheet: '老练' -> 'Lão luyện'\n",
      "Đã dịch sheet: '二次试亮' -> 'Thử sáng lần hai'\n",
      "Đã cập nhật thành công file: D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'流程图': 'Lưu đồ',\n",
       " '分割光源板': 'Tấm dẫn sáng phân đoạn',\n",
       " '支架外观检验': 'Kiểm tra ngoại quan giá đỡ',\n",
       " '固定驱动': 'Ổ đĩa cố định',\n",
       " '固定接地线': 'Dây nối đất cố định',\n",
       " '整理电源线': 'Sắp xếp dây nguồn',\n",
       " '固定光源板': 'Bảng nguồn sáng cố định',\n",
       " '连接光源板': 'Kết nối bảng nguồn sáng',\n",
       " '焊接光源板': 'Hàn bảng nguồn sáng',\n",
       " '连接驱动': 'Kết nối trình điều khiển',\n",
       " '安装端头 ': 'Lắp đặt đầu cuối',\n",
       " '试 亮': 'Thử Sáng',\n",
       " '绝缘、接地 ': 'Cách điện, tiếp đất',\n",
       " '安装透光罩 (2)': 'Lắp chụp đèn (2)',\n",
       " '老练': 'Lão luyện',\n",
       " '二次试亮': 'Thử sáng lần hai'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Translate the sheet names of the unzipped workbook.xml; the returned\n",
    "# original -> translated mapping is displayed as this cell's output.\n",
    "translate_sheet_names_in_file(r'D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'流程图': 'Lưu đồ',\n",
       " '分割光源板': 'Tấm dẫn sáng phân đoạn',\n",
       " '支架外观检验': 'Kiểm tra ngoại quan của giá đỡ',\n",
       " '固定驱动': 'Ổ đĩa cố định',\n",
       " '固定接地线': 'Dây nối đất cố định',\n",
       " '整理电源线': 'Sắp xếp dây nguồn',\n",
       " '固定光源板': 'Tấm nền nguồn sáng cố định',\n",
       " '连接光源板': 'Kết nối bảng nguồn sáng',\n",
       " '焊接光源板': 'Hàn bảng nguồn sáng',\n",
       " '连接驱动': 'Kết nối trình điều khiển',\n",
       " '安装端头 ': 'Lắp đặt đầu cuối',\n",
       " '试 亮': 'Thử Lượng',\n",
       " '绝缘、接地 ': 'Cách điện, tiếp đất',\n",
       " '安装透光罩 (2)': 'Lắp đặt chụp đèn trong suốt (2)',\n",
       " '老练': 'Lão luyện',\n",
       " '二次试亮': 'Thử sáng lần hai'}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the sheet-name mapping.\n",
    "# NOTE(review): sheets_name_d is not assigned in any of the nearby cells -\n",
    "# confirm which cell defines it so this still works after Restart & Run All.\n",
    "sheets_name_d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from typing import Dict, Callable\n",
    "# Giả sử utils.utils.translate_single_text đã được import và hoạt động đúng\n",
    "# từ file trước: from utils.utils import translate_single_text\n",
    "\n",
    "def translate_sheet_names_via_regex(\n",
    "    workbook_xml_path: str,\n",
    "    source_lang: str = 'chinese',\n",
    "    target_lang: str = 'vietnamese'\n",
    ") -> (Dict[str, str], bool):\n",
    "\n",
    "    original_to_translated_map: Dict[str, str] = {}\n",
    "    modified_content: str = \"\"\n",
    "    file_changed_flag: bool = False # Sử dụng tên biến rõ ràng hơn\n",
    "\n",
    "    try:\n",
    "        with open(workbook_xml_path, 'r', encoding='utf-8') as f:\n",
    "            content = f.read()\n",
    "\n",
    "        current_content = content\n",
    "\n",
    "        def replace_name_callback(match_obj):\n",
    "            nonlocal file_changed_flag # Để sửa đổi biến bên ngoài\n",
    "            nonlocal original_to_translated_map\n",
    "\n",
    "            attr_prefix = match_obj.group(1)  # Ví dụ: '<sheet other_attr=\"foo\" name='\n",
    "            opening_quote = match_obj.group(2) # Ví dụ: '\"'\n",
    "            original_name_xml_encoded = match_obj.group(3)\n",
    "            attr_suffix = match_obj.group(5)  # Ví dụ: ' sheetId=\"12\" r:id=\"rId1\"/>'\n",
    "\n",
    "            original_name = original_name_xml_encoded # Tạm thời bỏ qua unescape/escape cho đơn giản ví dụ\n",
    "\n",
    "            if not original_name.strip():\n",
    "                return match_obj.group(0) # Trả về chuỗi gốc nếu tên rỗng\n",
    "\n",
    "            translated_name = original_name # Mặc định giữ nguyên\n",
    "            if original_name in original_to_translated_map and original_to_translated_map[original_name] != original_name:\n",
    "                translated_name = original_to_translated_map[original_name]\n",
    "                # Nếu đã dịch và có thay đổi, không cần gọi API dịch nữa\n",
    "                if translated_name != original_name: # Cần kiểm tra lại vì map có thể lưu tên gốc nếu dịch lỗi\n",
    "                     print(f\"Regex: Sử dụng bản dịch đã có cho '{original_name}' -> '{translated_name}'\")\n",
    "                     file_changed_flag = True # Đảm bảo cờ được set nếu sử dụng bản dịch đã có mà khác gốc\n",
    "            else:\n",
    "                try:\n",
    "                    translated_name_raw = translate_single_text(original_name, source_lang, target_lang)\n",
    "\n",
    "                    if translated_name_raw and translated_name_raw.strip() and translated_name_raw != original_name:\n",
    "                        translated_name = translated_name_raw[:31]\n",
    "                        original_to_translated_map[original_name] = translated_name\n",
    "                        file_changed_flag = True\n",
    "                        print(f\"Regex: Đã dịch sheet: '{original_name}' -> '{translated_name}'\")\n",
    "                    else:\n",
    "                        original_to_translated_map[original_name] = original_name # Lưu tên gốc nếu dịch lỗi/không đổi\n",
    "                        # translated_name vẫn là original_name\n",
    "                        if translated_name_raw and translated_name_raw.strip() and translated_name_raw == original_name:\n",
    "                            print(f\"Bản dịch cho '{original_name}' giống hệt bản gốc, không thay đổi.\")\n",
    "                        elif not (translated_name_raw and translated_name_raw.strip()):\n",
    "                             print(f\"Bản dịch cho '{original_name}' trống hoặc không hợp lệ, giữ nguyên.\")\n",
    "\n",
    "                except Exception as e_translate:\n",
    "                    print(f\"Lỗi khi gọi hàm dịch cho '{original_name}': {e_translate}\")\n",
    "                    original_to_translated_map[original_name] = original_name\n",
    "\n",
    "            translated_name_xml_encoded = translated_name # Tạm thời bỏ qua escape\n",
    "\n",
    "            return f\"{attr_prefix}{opening_quote}{translated_name_xml_encoded}{opening_quote}{attr_suffix}\"\n",
    "\n",
    "        sheet_name_pattern = re.compile(\n",
    "            r'(<sheet[^>]*?\\sname=)([\"\\'])((?:(?!\\2).)*?)(\\2)([^>]*?>)'\n",
    "        )\n",
    "        modified_content = sheet_name_pattern.sub(replace_name_callback, current_content)\n",
    "\n",
    "        if file_changed_flag:\n",
    "            with open(workbook_xml_path, 'w', encoding='utf-8') as f:\n",
    "                f.write(modified_content)\n",
    "            print(f\"Regex: Đã cập nhật thành công file: {workbook_xml_path}\")\n",
    "            return original_to_translated_map, True\n",
    "        else:\n",
    "            print(f\"Regex: Không có tên sheet nào được thay đổi trong file: {workbook_xml_path}\")\n",
    "\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Lỗi: Không tìm thấy file tại '{workbook_xml_path}'\")\n",
    "    except Exception as e:\n",
    "        print(f\"Đã xảy ra lỗi không mong muốn khi xử lý file '{workbook_xml_path}' bằng regex: {e}\")\n",
    "        import traceback\n",
    "        traceback.print_exc()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Regex: Đã dịch sheet: '流程图' -> 'Lưu đồ'\n",
      "Regex: Đã dịch sheet: '分割光源板' -> 'Tấm dẫn sáng phân đoạn'\n",
      "Regex: Đã dịch sheet: '支架外观检验' -> 'Kiểm tra ngoại quan giá đỡ'\n",
      "Regex: Đã dịch sheet: '固定驱动' -> 'Ổ đĩa cố định'\n",
      "Regex: Đã dịch sheet: '固定接地线' -> 'Dây tiếp đất cố định'\n",
      "Regex: Đã dịch sheet: '整理电源线' -> 'Sắp xếp dây nguồn'\n",
      "Regex: Đã dịch sheet: '固定光源板' -> 'Bảng nguồn sáng cố định'\n",
      "Regex: Đã dịch sheet: '连接光源板' -> 'Kết nối bảng đèn nền'\n",
      "Regex: Đã dịch sheet: '焊接光源板' -> 'Bảng nguồn sáng hàn'\n",
      "Regex: Đã dịch sheet: '连接驱动' -> 'Kết nối trình điều khiển'\n",
      "Regex: Đã dịch sheet: '安装端头 ' -> 'Lắp đặt đầu cuối'\n",
      "Regex: Đã dịch sheet: '试 亮' -> 'Thử Sáng'\n",
      "Regex: Đã dịch sheet: '绝缘、接地 ' -> 'Cách điện, tiếp đất'\n",
      "Regex: Đã dịch sheet: '安装透光罩 (2)' -> 'Lắp chụp đèn (2)'\n",
      "Regex: Đã dịch sheet: '老练' -> 'Lão luyện'\n",
      "Regex: Đã dịch sheet: '二次试亮' -> 'Thử sáng lần hai'\n",
      "Regex: Đã cập nhật thành công file: D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml\n"
     ]
    }
   ],
   "source": [
    "# Run the regex-based variant on the unzipped workbook.xml and unpack its\n",
    "# result into the name mapping and the file-changed flag.\n",
    "translation_map_regex, success_regex = translate_sheet_names_via_regex(r'D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n",
    "<workbook xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\" xmlns:mc=\"http://schemas.openxmlformats.org/markup-compatibility/2006\" mc:Ignorable=\"x15 xr xr6 xr10 xr2\" xmlns:x15=\"http://schemas.microsoft.com/office/spreadsheetml/2010/11/main\" xmlns:xr=\"http://schemas.microsoft.com/office/spreadsheetml/2014/revision\" xmlns:xr6=\"http://schemas.microsoft.com/office/spreadsheetml/2016/revision6\" xmlns:xr10=\"http://schemas.microsoft.com/office/spreadsheetml/2016/revision10\" xmlns:xr2=\"http://schemas.microsoft.com/office/spreadsheetml/2015/revision2\"><fileVersion appName=\"xl\" lastEdited=\"7\" lowestEdited=\"5\" rupBuild=\"28324\"/><workbookPr codeName=\"ThisWorkbook\"/><mc:AlternateContent xmlns:mc=\"http://schemas.openxmlformats.org/markup-compatibility/2006\"><mc:Choice Requires=\"x15\"><x15ac:absPath url=\"G:\\Machine_Learning\\machine_translation\\temp\\\" xmlns:x15ac=\"http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac\"/></mc:Choice></mc:AlternateContent><xr:revisionPtr revIDLastSave=\"0\" documentId=\"13_ncr:1_{0BB03666-2754-4098-B7E4-88837ADAC82E}\" xr6:coauthVersionLast=\"47\" xr6:coauthVersionMax=\"47\" xr10:uidLastSave=\"{00000000-0000-0000-0000-000000000000}\"/><bookViews><workbookView xWindow=\"-110\" yWindow=\"-110\" windowWidth=\"25820\" windowHeight=\"15500\" firstSheet=\"15\" activeTab=\"15\" xr2:uid=\"{00000000-000D-0000-FFFF-FFFF00000000}\"/></bookViews><sheets><sheet name=\"流程图\" sheetId=\"12\" r:id=\"rId1\"/><sheet name=\"分割光源板\" sheetId=\"2\" r:id=\"rId2\"/><sheet name=\"支架外观检验\" sheetId=\"24\" r:id=\"rId3\"/><sheet name=\"固定驱动\" sheetId=\"21\" r:id=\"rId4\"/><sheet name=\"固定接地线\" sheetId=\"4\" r:id=\"rId5\"/><sheet name=\"整理电源线\" sheetId=\"22\" r:id=\"rId6\"/><sheet name=\"固定光源板\" sheetId=\"3\" r:id=\"rId7\"/><sheet name=\"连接光源板\" sheetId=\"5\" r:id=\"rId8\"/><sheet name=\"焊接光源板\" sheetId=\"29\" r:id=\"rId9\"/><sheet 
name=\"连接驱动\" sheetId=\"30\" r:id=\"rId10\"/><sheet name=\"安装端头 \" sheetId=\"31\" r:id=\"rId11\"/><sheet name=\"试 亮\" sheetId=\"20\" r:id=\"rId12\"/><sheet name=\"绝缘、接地 \" sheetId=\"9\" r:id=\"rId13\"/><sheet name=\"安装透光罩 (2)\" sheetId=\"32\" state=\"hidden\" r:id=\"rId14\"/><sheet name=\"老练\" sheetId=\"10\" r:id=\"rId15\"/><sheet name=\"二次试亮\" sheetId=\"6\" r:id=\"rId16\"/></sheets><definedNames><definedName name=\"_xlnm.Print_Area\" localSheetId=\"15\">二次试亮!$A$1:$L$39</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"1\">分割光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"6\">固定光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"4\">固定接地线!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"3\">固定驱动!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"10\">'安装端头 '!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"5\">整理电源线!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"0\">流程图!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"8\">焊接光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"12\">'绝缘、接地 '!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"14\">老练!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"11\">'试 亮'!$A$1:$L$39</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"7\">连接光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"9\">连接驱动!$A$1:$L$40</definedName></definedNames><calcPr calcId=\"191029\"/><extLst><ext uri=\"{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}\" xmlns:xcalcf=\"http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures\"><xcalcf:calcFeatures><xcalcf:feature name=\"microsoft.com:RD\"/><xcalcf:feature name=\"microsoft.com:Single\"/><xcalcf:feature name=\"microsoft.com:FV\"/><xcalcf:feature 
name=\"microsoft.com:CNMTM\"/><xcalcf:feature name=\"microsoft.com:LET_WF\"/><xcalcf:feature name=\"microsoft.com:LAMBDA_WF\"/><xcalcf:feature name=\"microsoft.com:ARRAYTEXT_WF\"/></xcalcf:calcFeatures></ext></extLst></workbook>\n",
    "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n",
    "<workbook xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\" xmlns:mc=\"http://schemas.openxmlformats.org/markup-compatibility/2006\" mc:Ignorable=\"x15 xr xr6 xr10 xr2\" xmlns:x15=\"http://schemas.microsoft.com/office/spreadsheetml/2010/11/main\" xmlns:xr=\"http://schemas.microsoft.com/office/spreadsheetml/2014/revision\" xmlns:xr6=\"http://schemas.microsoft.com/office/spreadsheetml/2016/revision6\" xmlns:xr10=\"http://schemas.microsoft.com/office/spreadsheetml/2016/revision10\" xmlns:xr2=\"http://schemas.microsoft.com/office/spreadsheetml/2015/revision2\"><fileVersion appName=\"xl\" lastEdited=\"7\" lowestEdited=\"5\" rupBuild=\"28324\"/><workbookPr codeName=\"ThisWorkbook\"/><mc:AlternateContent xmlns:mc=\"http://schemas.openxmlformats.org/markup-compatibility/2006\"><mc:Choice Requires=\"x15\"><x15ac:absPath url=\"G:\\Machine_Learning\\machine_translation\\temp\\\" xmlns:x15ac=\"http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac\"/></mc:Choice></mc:AlternateContent><xr:revisionPtr revIDLastSave=\"0\" documentId=\"13_ncr:1_{0BB03666-2754-4098-B7E4-88837ADAC82E}\" xr6:coauthVersionLast=\"47\" xr6:coauthVersionMax=\"47\" xr10:uidLastSave=\"{00000000-0000-0000-0000-000000000000}\"/><bookViews><workbookView xWindow=\"-110\" yWindow=\"-110\" windowWidth=\"25820\" windowHeight=\"15500\" firstSheet=\"15\" activeTab=\"15\" xr2:uid=\"{00000000-000D-0000-FFFF-FFFF00000000}\"/></bookViews><sheets><sheet name=\"Lưu đồ\" sheetId=\"12\" r:id=\"rId1\"/><sheet name=\"Tấm dẫn sáng phân đoạn\" sheetId=\"2\" r:id=\"rId2\"/><sheet name=\"Kiểm tra ngoại quan giá đỡ\" sheetId=\"24\" r:id=\"rId3\"/><sheet name=\"Ổ đĩa cố định\" sheetId=\"21\" r:id=\"rId4\"/><sheet name=\"Dây tiếp đất cố định\" sheetId=\"4\" r:id=\"rId5\"/><sheet name=\"Sắp xếp dây nguồn\" sheetId=\"22\" r:id=\"rId6\"/><sheet name=\"Tấm nền nguồn sáng cố định\" sheetId=\"3\" r:id=\"rId7\"/><sheet name=\"Kết 
nối bảng đèn.\" sheetId=\"5\" r:id=\"rId8\"/><sheet name=\"Hàn bảng nguồn sáng\" sheetId=\"29\" r:id=\"rId9\"/><sheet name=\"Kết nối trình điều khiển\" sheetId=\"30\" r:id=\"rId10\"/><sheet name=\"Lắp đặt đầu cuối\" sheetId=\"31\" r:id=\"rId11\"/><sheet name=\"Thử Sáng\" sheetId=\"20\" r:id=\"rId12\"/><sheet name=\"Cách điện, tiếp đất\" sheetId=\"9\" r:id=\"rId13\"/><sheet name=\"Lắp chụp đèn (2)\" sheetId=\"32\" state=\"hidden\" r:id=\"rId14\"/><sheet name=\"Lão luyện\" sheetId=\"10\" r:id=\"rId15\"/><sheet name=\"Thử sáng lần hai\" sheetId=\"6\" r:id=\"rId16\"/></sheets><definedNames><definedName name=\"_xlnm.Print_Area\" localSheetId=\"15\">二次试亮!$A$1:$L$39</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"1\">分割光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"6\">固定光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"4\">固定接地线!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"3\">固定驱动!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"10\">'安装端头 '!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"5\">整理电源线!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"0\">流程图!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"8\">焊接光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"12\">'绝缘、接地 '!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"14\">老练!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"11\">'试 亮'!$A$1:$L$39</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"7\">连接光源板!$A$1:$L$40</definedName><definedName name=\"_xlnm.Print_Area\" localSheetId=\"9\">连接驱动!$A$1:$L$40</definedName></definedNames><calcPr calcId=\"191029\"/><extLst><ext uri=\"{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}\" 
xmlns:xcalcf=\"http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures\"><xcalcf:calcFeatures><xcalcf:feature name=\"microsoft.com:RD\"/><xcalcf:feature name=\"microsoft.com:Single\"/><xcalcf:feature name=\"microsoft.com:FV\"/><xcalcf:feature name=\"microsoft.com:CNMTM\"/><xcalcf:feature name=\"microsoft.com:LET_WF\"/><xcalcf:feature name=\"microsoft.com:LAMBDA_WF\"/><xcalcf:feature name=\"microsoft.com:ARRAYTEXT_WF\"/></xcalcf:calcFeatures></ext></extLst></workbook>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "26"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str = 'Kiểm tra ngoại quan giá đỡ'\n",
    "len(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import zipfile\n",
    "import copy\n",
    "import time\n",
    "import xml.etree.ElementTree as ET\n",
    "from typing import List, Dict, Any, Optional, Tuple\n",
    "from utils.utils import translate_text, unzip_office_file, preprocess_text, postprocess_text, translate_single_text\n",
    "from pymongo import MongoClient\n",
    "import gridfs\n",
    "from io import BytesIO\n",
    "import shutil\n",
    "import io\n",
    "import re\n",
    "from typing import Dict\n",
    "\n",
    "\n",
    "NS_MAIN = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}\n",
    "NS_DRAWING = {'xdr': \"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing\"}\n",
    "NS_A = {'a': \"http://schemas.openxmlformats.org/drawingml/2006/main\"}\n",
    "\n",
    "# --- Hàm đăng ký namespace (quan trọng khi ghi file) ---\n",
    "def register_namespaces(xml_file):\n",
    "    \"\"\"Đọc và đăng ký các namespace từ file XML.\"\"\"\n",
    "    namespaces = dict([\n",
    "        node for _, node in ET.iterparse(xml_file, events=['start-ns'])\n",
    "    ])\n",
    "    for ns, uri in namespaces.items():\n",
    "        ET.register_namespace(ns, uri)\n",
    "\n",
    "    # Đăng ký thêm namespace phổ biến nếu chưa có\n",
    "    if 'main' not in namespaces and '' not in namespaces and NS_MAIN['main'] not in namespaces.values():\n",
    "         ET.register_namespace('', NS_MAIN['main'])\n",
    "    elif 'main' not in namespaces and NS_MAIN['main'] not in namespaces.values():\n",
    "         ET.register_namespace('main', NS_MAIN['main'])\n",
    "\n",
    "    # Đăng ký namespaces cho drawing nếu cần\n",
    "    if 'xdr' not in namespaces and NS_DRAWING['xdr'] not in namespaces.values():\n",
    "        ET.register_namespace('xdr', NS_DRAWING['xdr'])\n",
    "    if 'a' not in namespaces and NS_A['a'] not in namespaces.values():\n",
    "        ET.register_namespace('a', NS_A['a'])\n",
    "\n",
    "\n",
    "def extract_text_from_sheet(unzipped_folder_path: str) -> Optional[Tuple[List[Dict[str, Any]], Dict[str, Any]]]:\n",
    "    \"\"\"\n",
    "    Trích xuất text, lưu lại định dạng của run đầu tiên nếu là Rich Text,\n",
    "    bao gồm cả text từ TextBoxes trong drawings.\n",
    "    \"\"\"\n",
    "    modifiable_nodes = []\n",
    "    shared_strings_path = os.path.join(unzipped_folder_path, \"xl\", \"sharedStrings.xml\")\n",
    "    worksheets_folder = os.path.join(unzipped_folder_path, \"xl\", \"worksheets\")\n",
    "    drawings_folder = os.path.join(unzipped_folder_path, \"xl\", \"drawings\") # Thêm dòng này\n",
    "\n",
    "    shared_tree = None\n",
    "    sheet_trees = {}\n",
    "    drawing_trees = {} # Thêm dòng này\n",
    "\n",
    "    # --- Xử lý sharedStrings.xml ---\n",
    "    if os.path.exists(shared_strings_path):\n",
    "        try:\n",
    "            register_namespaces(shared_strings_path) # Đảm bảo register_namespaces được gọi\n",
    "            shared_tree = ET.parse(shared_strings_path)\n",
    "            root_shared = shared_tree.getroot()\n",
    "\n",
    "            for si_element in root_shared.findall('main:si', NS_MAIN):\n",
    "                text_parts = []\n",
    "                # Tìm tất cả <t> con, bất kể chúng nằm trong <r> hay không\n",
    "                t_elements = si_element.findall('.//main:t', NS_MAIN)\n",
    "\n",
    "                first_r = si_element.find('./main:r', NS_MAIN)\n",
    "                first_rpr_clone = None\n",
    "                is_rich_text = first_r is not None # Rich text nếu có ít nhất một <r>\n",
    "\n",
    "                if is_rich_text:\n",
    "                    # Cố gắng tìm <rPr> bên trong <r> đầu tiên\n",
    "                    first_rpr_candidate = si_element.find('./main:r/main:rPr', NS_MAIN)\n",
    "                    if first_rpr_candidate is not None:\n",
    "                        first_rpr_clone = copy.deepcopy(first_rpr_candidate)\n",
    "                    else:\n",
    "                        # Nếu <r> đầu tiên không có <rPr>, kiểm tra <si><rPh><rPr> (Phonetic properties, ít gặp hơn)\n",
    "                        # Hoặc có thể không có định dạng nào cụ thể ở run đầu\n",
    "                        pass\n",
    "\n",
    "\n",
    "                for t_node in t_elements:\n",
    "                    if t_node.text:\n",
    "                        text_parts.append(t_node.text)\n",
    "                full_text = \"\".join(text_parts)\n",
    "\n",
    "                if not full_text or full_text.isspace(): continue\n",
    "\n",
    "                # Logic xác định type dựa trên sự hiện diện của <r> và <rPr> đã được điều chỉnh\n",
    "                if is_rich_text : # Chỉ cần có <r> là đủ, first_rpr_clone có thể là None\n",
    "                    modifiable_nodes.append({\n",
    "                        'type': 'shared_rich',\n",
    "                        'original_text': full_text,\n",
    "                        'element': si_element,\n",
    "                        'first_format': first_rpr_clone, # Sẽ là None nếu <r> đầu không có <rPr>\n",
    "                        'source_file': os.path.join(\"xl\", \"sharedStrings.xml\"),\n",
    "                        'sheet_name': None\n",
    "                    })\n",
    "                elif t_elements:\n",
    "                    direct_t = si_element.find('./main:t', NS_MAIN)\n",
    "                    if direct_t is not None:\n",
    "                         modifiable_nodes.append({\n",
    "                            'type': 'shared_simple',\n",
    "                            'original_text': full_text,\n",
    "                            'element': direct_t, # Tham chiếu <t>\n",
    "                            'first_format': None,\n",
    "                            'source_file': os.path.join(\"xl\", \"sharedStrings.xml\"),\n",
    "                            'sheet_name': None\n",
    "                        })\n",
    "                    # else: ít khả năng xảy ra nếu t_elements có phần tử\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"Lỗi xử lý sharedStrings: {e}\")\n",
    "            import traceback\n",
    "            traceback.print_exc()\n",
    "\n",
    "\n",
    "    # --- Xử lý các file sheetX.xml (Inline Strings) ---\n",
    "    if os.path.isdir(worksheets_folder):\n",
    "        for sheet_filename in sorted(os.listdir(worksheets_folder)):\n",
    "             if sheet_filename.lower().endswith(\".xml\"):\n",
    "                sheet_file_path = os.path.join(worksheets_folder, sheet_filename)\n",
    "                try:\n",
    "                    register_namespaces(sheet_file_path) # Đảm bảo register_namespaces được gọi\n",
    "                    sheet_tree = ET.parse(sheet_file_path)\n",
    "                    sheet_trees[sheet_filename] = sheet_tree\n",
    "                    root_sheet = sheet_tree.getroot()\n",
    "                    for cell in root_sheet.findall('.//main:c[@t=\"inlineStr\"]', NS_MAIN):\n",
    "                        t_element = cell.find('.//main:is/main:t', NS_MAIN) # Sửa lại tìm kiếm <t>\n",
    "                        if t_element is not None and t_element.text is not None and t_element.text.strip():\n",
    "                             modifiable_nodes.append({\n",
    "                                'type': 'inline',\n",
    "                                'original_text': t_element.text,\n",
    "                                'element': t_element,\n",
    "                                'first_format': None,\n",
    "                                'source_file': os.path.join(\"xl\", \"worksheets\", sheet_filename),\n",
    "                                'sheet_name': sheet_filename\n",
    "                             })\n",
    "                except Exception as e:\n",
    "                     print(f\"Lỗi xử lý sheet {sheet_filename}: {e}\")\n",
    "                     import traceback\n",
    "                     traceback.print_exc()\n",
    "    else:\n",
    "        print(f\"Cảnh báo: Không tìm thấy thư mục worksheets: {worksheets_folder}\")\n",
    "\n",
    "\n",
    "    # --- Xử lý các file drawingX.xml (Text Boxes, Shapes with Text) ---\n",
    "    if os.path.isdir(drawings_folder):\n",
    "        for drawing_filename in sorted(os.listdir(drawings_folder)):\n",
    "            if drawing_filename.lower().endswith(\".xml\"):\n",
    "                drawing_file_path = os.path.join(drawings_folder, drawing_filename)\n",
    "                try:\n",
    "                    register_namespaces(drawing_file_path) # Đảm bảo register_namespaces được gọi\n",
    "                    drawing_tree = ET.parse(drawing_file_path)\n",
    "                    drawing_trees[drawing_filename] = drawing_tree\n",
    "                    root_drawing = drawing_tree.getroot()\n",
    "\n",
    "                    # TextBoxes và Shapes có text thường nằm trong <xdr:sp> (shape) -> <xdr:txBody> (text body)\n",
    "                    # Bên trong <xdr:txBody> là các <a:p> (paragraph)\n",
    "                    for p_element in root_drawing.findall('.//xdr:txBody/a:p', {**NS_DRAWING, **NS_A}):\n",
    "                        text_parts = []\n",
    "                        # Lấy text từ tất cả <a:t> trong paragraph này\n",
    "                        t_elements = p_element.findall('.//a:t', NS_A)\n",
    "\n",
    "                        first_r = p_element.find('./a:r', NS_A) # Tìm <a:r> con trực tiếp đầu tiên của <a:p>\n",
    "                        first_rpr_clone = None # Định dạng của run đầu tiên trong paragraph\n",
    "\n",
    "                        is_rich_text_paragraph = first_r is not None # Coi là rich nếu có <a:r>\n",
    "\n",
    "                        if is_rich_text_paragraph:\n",
    "                            # Tìm <a:rPr> bên trong <a:r> đầu tiên của <a:p>\n",
    "                            first_rpr = first_r.find('./a:rPr', NS_A)\n",
    "                            if first_rpr is not None:\n",
    "                                first_rpr_clone = copy.deepcopy(first_rpr)\n",
    "\n",
    "                        for t_node in t_elements:\n",
    "                            if t_node.text:\n",
    "                                text_parts.append(t_node.text)\n",
    "                        full_text = \"\".join(text_parts)\n",
    "\n",
    "                        if not full_text or full_text.isspace(): continue\n",
    "\n",
    "                        # Lưu node là <a:p> vì chúng ta sẽ thay thế toàn bộ nội dung của nó\n",
    "                        # (các <a:r> và <a:t> bên trong)\n",
    "                        modifiable_nodes.append({\n",
    "                            'type': 'drawing_text', # Loại mới cho text trong drawing\n",
    "                            'original_text': full_text,\n",
    "                            'element': p_element,      # Tham chiếu đến <a:p>\n",
    "                            'first_format': first_rpr_clone, # Lưu định dạng <a:rPr> của <a:r> đầu tiên (hoặc None)\n",
    "                            'source_file': os.path.join(\"xl\", \"drawings\", drawing_filename),\n",
    "                            'sheet_name': None # Có thể tìm cách liên kết ngược lại sheet nếu cần\n",
    "                        })\n",
    "                except Exception as e:\n",
    "                    print(f\"Lỗi xử lý drawing {drawing_filename}: {e}\")\n",
    "                    import traceback\n",
    "                    traceback.print_exc()\n",
    "    else:\n",
    "        print(f\"Thông tin: Không tìm thấy thư mục drawings: {drawings_folder}\")\n",
    "\n",
    "\n",
    "    global_data = {\n",
    "        \"shared_tree\": shared_tree,\n",
    "        \"sheet_trees\": sheet_trees,\n",
    "        \"drawing_trees\": drawing_trees, # Thêm dòng này\n",
    "        \"shared_strings_path\": shared_strings_path,\n",
    "        \"worksheets_folder\": worksheets_folder,\n",
    "        \"drawings_folder\": drawings_folder # Thêm dòng này\n",
    "    }\n",
    "    return modifiable_nodes, global_data\\\n",
    "\n",
    "\n",
    "\n",
    "def zip_folder_to_excel_file(folder_path, file_name):\n",
    "    try:\n",
    "        # Nén thư mục thành file .xlsx trong RAM\n",
    "        xlsx_buffer = io.BytesIO()\n",
    "        with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
    "            for root, _, files in os.walk(folder_path):\n",
    "                for file in files:\n",
    "                    file_path = os.path.join(root, file)\n",
    "                    archive_path = os.path.relpath(file_path, folder_path)\n",
    "                    zipf.write(file_path, archive_path)\n",
    "\n",
    "        xlsx_buffer.seek(0)\n",
    "\n",
    "        client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "        db = client['excel']  \n",
    "        fs = gridfs.GridFS(db, collection='final_file')\n",
    "\n",
    "        file_id = fs.put(xlsx_buffer.read(), filename=file_name)\n",
    "        print(f\"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}\")\n",
    "        return file_id\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}\")\n",
    "        return None\n",
    "    \n",
    "\n",
    "\n",
    "def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):\n",
    "    \n",
    "    client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n",
    "    db = client['excel']\n",
    "    fs = gridfs.GridFS(db, collection='root_file')\n",
    "    \n",
    "    ppt_file = fs.get(file_id)\n",
    "    excel_file = BytesIO(ppt_file.read())\n",
    "\n",
    "    xml_folder = unzip_office_file(excel_file)\n",
    "    path_to_workbook_xml = os.path.join(xml_folder, \"xl\", \"workbook.xml\")\n",
    "    translate_sheet_names_via_regex(path_to_workbook_xml, source_lang, target_lang)\n",
    "\n",
    "    modifiable_nodes, global_data = extract_text_from_sheet(xml_folder)\n",
    "\n",
    "    original_texts = get_text_list_from_nodes(modifiable_nodes)\n",
    "\n",
    "    all_results = [None] * len(original_texts)\n",
    "    current_index = 0\n",
    "    processed_count = 0\n",
    "    api_call_counter = 0 # Track API calls for delay logic\n",
    "\n",
    "    while current_index < len(original_texts):\n",
    "        batch_texts_to_translate = []\n",
    "        batch_original_indices = [] # 0-based indices for assignment\n",
    "        batch_end_index = min(current_index + batch_size_segments, len(original_texts))\n",
    "        found_long_segment_at = -1 # 0-based index in original_texts\n",
    "\n",
    "        # 1. Build the next potential batch, stopping if a long segment is found\n",
    "        for i in range(current_index, batch_end_index):\n",
    "            segment = original_texts[i]\n",
    "            word_count = count_words(segment)\n",
    "\n",
    "            if word_count <= max_words_per_segment:\n",
    "                batch_texts_to_translate.append(segment)\n",
    "                batch_original_indices.append(i)\n",
    "            else:\n",
    "                found_long_segment_at = i\n",
    "                break # Stop building this batch\n",
    "\n",
    "        # --- Process the findings ---\n",
    "\n",
    "        # 2. Translate the VALID batch collected *before* the long segment (if any)\n",
    "        if batch_texts_to_translate:\n",
    "            # Add delay BEFORE the API call if it's not the very first call\n",
    "            if api_call_counter > 0 and delay_between_requests > 0:\n",
    "                    time.sleep(delay_between_requests)\n",
    "\n",
    "            translated_batch = _translate_batch_helper(\n",
    "                batch_texts_to_translate,\n",
    "                [idx + 1 for idx in batch_original_indices], # 1-based for logging\n",
    "                source_lang,\n",
    "                target_lang\n",
    "            )\n",
    "            api_call_counter += 1\n",
    "            # Assign results back\n",
    "            for batch_idx, original_idx in enumerate(batch_original_indices):\n",
    "                all_results[original_idx] = translated_batch[batch_idx]\n",
    "            processed_count += len(batch_texts_to_translate)\n",
    "\n",
    "        # 3. Handle the long segment INDIVIDUALLY (if one was found)\n",
    "        if found_long_segment_at != -1:\n",
    "            long_segment_index = found_long_segment_at\n",
    "            long_segment_text = str(original_texts[long_segment_index])\n",
    "            # word_count = count_words(long_segment_text) # Recalculate for log clarity\n",
    "\n",
    "            try:\n",
    "                translated = translate_single_text(long_segment_text, source_lang, target_lang)\n",
    "                \n",
    "                final = [translated]\n",
    "                api_call_counter += 1\n",
    "\n",
    "                if len(final) == 1:\n",
    "                    all_results[long_segment_index] = final[0]\n",
    "                else:\n",
    "                    print(f\"    *** CRITICAL ERROR: Long segment translation result count mismatch! Expected 1, got {len(final)}. Marking as failed.\")\n",
    "                    all_results[long_segment_index] = \"<translation_length_mismatch_error>\"\n",
    "\n",
    "            except Exception as e:\n",
    "                print(f\"    *** ERROR during translation of long segment {long_segment_index + 1}: {e}. Marking as failed.\")\n",
    "                # traceback.print_exc() # Uncomment for detailed debug\n",
    "                all_results[long_segment_index] = \"<translation_api_error>\"\n",
    "                # Do not increment api_call_counter if the API call itself failed before returning\n",
    "\n",
    "            processed_count += 1\n",
    "            # Update current_index to start AFTER this long segment\n",
    "            current_index = long_segment_index + 1\n",
    "\n",
    "        else:\n",
    "            # No long segment was found in the range checked.\n",
    "            # Move current_index to the end of the range examined.\n",
    "            current_index = batch_end_index\n",
    "\n",
    "    missing_count = 0\n",
    "    final_texts_for_nodes = []\n",
    "    for i, res in enumerate(all_results):\n",
    "            if res is None:\n",
    "                print(f\"LỖI LOGIC: Segment {i+1} không được xử lý! Giữ lại text gốc: '{original_texts[i]}'\")\n",
    "                final_texts_for_nodes.append(original_texts[i])\n",
    "                missing_count += 1\n",
    "            else:\n",
    "                final_texts_for_nodes.append(res)\n",
    "\n",
    "    if missing_count > 0:\n",
    "            print(f\"CẢNH BÁO NGHIÊM TRỌNG: {missing_count} segments bị bỏ lỡ trong quá trình xử lý.\")\n",
    "\n",
    "    if len(final_texts_for_nodes) != len(original_texts):\n",
    "        print(f\"LỖI NGHIÊM TRỌNG: Số lượng text cuối cùng ({len(final_texts_for_nodes)}) không khớp với gốc ({len(original_texts)}). Hủy bỏ cập nhật.\")\n",
    "    else:\n",
    "        # Gán vào node\n",
    "        for i, node_info in enumerate(modifiable_nodes):\n",
    "            node_info['modified_text'] = final_texts_for_nodes[i]\n",
    "        \n",
    "        save_success = apply_and_save_changes(modifiable_nodes, global_data)\n",
    "        if not save_success:\n",
    "            print(\"LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.\")\n",
    "        else:\n",
    "            # Only zip if saving XML was successful\n",
    "            final_id = zip_folder_to_excel_file(xml_folder, file_name)\n",
    "            if final_id:\n",
    "                shutil.rmtree(xml_folder) # Mark folder as 'handled' by zipping\n",
    "            else:\n",
    "                print(\"LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.\")\n",
    "    return final_id"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "machine_translate",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}