{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Được thôi, đây là một bài thơ haiku về lập trình Python:\n", "\n", "Cú pháp thật mềm,\n", "Code chạy nhanh, không lỗi lầm.\n", "Vui khi được code.\n", "\n" ] } ], "source": [ "import google.generativeai as genai\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", "import os\n", "\n", "genai.configure(api_key=os.getenv(\"GEMINI_API_KEY\")) # Thay thế bằng khóa API của bạn\n", "\n", "model = genai.GenerativeModel(\"gemini-2.0-flash-lite\") # hoặc gemini-1.5-pro-latest\n", "\n", "response = model.generate_content(\"Viết một bài thơ haiku về lập trình Python.\")\n", "print(response.text)\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pymongo import MongoClient\n", "import gridfs\n", "from bson import ObjectId\n", "import os\n", "from pptx import Presentation\n", "from concurrent.futures import ThreadPoolExecutor\n", "from xml.dom import minidom\n", "import xml.etree.ElementTree as ET\n", "# from pptx_object import get_table_properties, get_shape_properties\n", "from pptx.enum.shapes import MSO_SHAPE_TYPE\n", "from typing import Dict, List, Optional\n", "import json\n", "# from translator import translate_text_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Delete file in DB" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def delete_all_files_in_collection(collection_name, db_name=\"ppt\"):\n", " try:\n", " # Kết nối MongoDB\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", "\n", " # Khởi tạo GridFS với collection được chỉ định\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " # Lấy danh sách file_id của tất cả file trong GridFS\n", " file_ids = [file[\"_id\"] for file in db[f\"{collection_name}.files\"].find({})]\n", "\n", " # Xóa từng file trong GridFS\n", " for file_id in file_ids:\n", " fs.delete(file_id)\n", "\n", " print(f\"✅ Đã xóa {len(file_ids)} file trong collection '{collection_name}' của db '{db_name}'\")\n", "\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi xóa file: {str(e)}\")\n", "\n", " finally:\n", " client.close()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa 3 file trong collection 'final_pptx'\n" ] } ], "source": [ "delete_all_files_in_collection(\"final_pptx\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def delete_pptx_from_mongodb(file_id, db_name=\"ppt\", collection_name=\"_xml\"):\n", " \"\"\"\n", " Xóa file PowerPoint khỏi MongoDB theo ID.\n", "\n", " :param file_id: ID của file cần xóa (chuỗi hoặc ObjectId)\n", " :param db_name: Tên database trong MongoDB\n", " :param collection_name: Tên collection GridFS\n", " \"\"\"\n", " # Kết nối đến MongoDB\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " try:\n", " # Chuyển đổi ID nếu cần\n", " if not isinstance(file_id, ObjectId):\n", " file_id = ObjectId(file_id)\n", "\n", " # Kiểm tra file có tồn tại không\n", " if fs.exists(file_id):\n", " fs.delete(file_id)\n", " print(f\"✅ Đã xóa file với ID: {file_id}\")\n", " else:\n", " print(f\"⚠️ Không tìm thấy file với ID: {file_id}\")\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi xóa file: {e}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Download file from DB" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def download_input_from_mongodb(file_id, save_name, db_name=\"excel\", collection_name=\"root_file\"):\n", " os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\input\", exist_ok=True)\n", "\n", " full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\input\", save_name)\n", "\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " try:\n", " if not isinstance(file_id, ObjectId):\n", " file_id = ObjectId(file_id)\n", "\n", " file_data = fs.get(file_id)\n", " \n", " with open(full_file_path, \"wb\") as f:\n", " f.write(file_data.read())\n", "\n", " print(f\"✅ File đã được tải về: {full_file_path}\")\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi tải file: {e}\")\n", " finally:\n", " client.close()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def download_output_from_mongodb(file_id, save_name, db_name=\"excel\", collection_name=\"root_file\"):\n", " os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\output\", exist_ok=True)\n", "\n", " full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\output\", save_name)\n", "\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " try:\n", " if not isinstance(file_id, ObjectId):\n", " file_id = ObjectId(file_id)\n", "\n", " file_data = fs.get(file_id)\n", " \n", " with open(full_file_path, \"wb\") as f:\n", " f.write(file_data.read())\n", "\n", " print(f\"✅ File đã được tải về: {full_file_path}\")\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi tải file: {e}\")\n", " finally:\n", " client.close()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\szero-point-ieks-essays-1350537845-9781350537842_compress.docx\n" ] } ], "source": [ "download_input_from_mongodb(file_id=\"6843696876015abc15cc759f\", save_name=\"szero-point-ieks-essays-1350537845-9781350537842_compress.docx\", db_name=\"word\", collection_name=\"root_file\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\samsung_presentation_vietnamese.pptx\n" ] } ], "source": [ "download_output_from_mongodb(file_id=\"684194c376015abc15cc7428\", save_name=\"samsung_presentation_vietnamese.pptx\", db_name=\"pptx\", collection_name=\"final_file\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# List all file " ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa 1 file trong collection 'root_file'\n", "✅ Đã xóa 0 file trong collection 'final_pptx'\n", "✅ Đã xóa 0 file trong collection 'original_xml'\n", "✅ Đã xóa 0 file trong collection 'final_xml'\n" ] } ], "source": [ "for i in ['root_file', 'final_pptx', 'original_xml', 'final_xml']:\n", "\n", " delete_all_files_in_collection(i, db_name = 'pptx')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa 0 file trong collection 'root_file' của db 'word'\n", "✅ Đã xóa 8 file trong collection 'root_file' của db 'excel'\n", "✅ Đã xóa 0 file trong collection 'root_file' của db 'pptx'\n", "✅ Đã xóa 0 file trong collection 'root_file' của db 'csv'\n", "✅ Đã xóa 0 file trong collection 'final_file' của db 'word'\n", "✅ Đã xóa 7 file trong collection 'final_file' của db 'excel'\n", "✅ Đã xóa 0 file trong collection 'final_file' của db 'pptx'\n", "✅ Đã xóa 0 file trong collection 'final_file' của db 'csv'\n" ] } ], "source": [ "for i in ['root_file', 'final_file']:\n", " for j in ['word', 'excel', 'pptx', 'csv']:\n", " delete_all_files_in_collection(i, db_name=j)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", "total_size = 0\n", "\n", "for db_name in ['word', 'exce', 'pptx', 'csv']:\n", " db = client[db_name]\n", " stats = db.command(\"dbstats\")\n", " db_size = stats.get(\"StorageSize\", 0)\n", " total_size += db_size\n", "total_size / (1024** 2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã lấy 0 dữ liệu đo lường.\n", "⚠️ Không có dữ liệu đo lường.\n" ] } ], "source": [ "import requests\n", "from requests.auth import HTTPDigestAuth\n", "import datetime\n", "\n", "# ==== Cấu hình ====\n", "PUBLIC_KEY = 'uetgyqkj'\n", "PRIVATE_KEY = '892caec5-8474-4043-862b-f4d4c617daa2'\n", "GROUP_ID = '67db8bf4ed971c2114aad7f1#' # còn gọi là Project ID\n", "CLUSTER_NAME = 'Cluster0'\n", "\n", "# ==== Lấy metric dung lượng dữ liệu ====\n", "url = f\"https://cloud.mongodb.com/api/atlas/v1.0/groups/{GROUP_ID}/clusters/{CLUSTER_NAME}/measurements\"\n", "\n", "params = {\n", " \"granularity\": \"PT1M\", # lấy theo từng phút\n", " \"period\": \"PT1H\", # 5 phút gần nhất\n", " \"m\": \"DATA_SIZE_TOTAL\", # metric cần lấy\n", "}\n", "\n", "response = requests.get(\n", " url,\n", " auth=HTTPDigestAuth(PUBLIC_KEY, PRIVATE_KEY),\n", " params=params\n", ")\n", "\n", "# ==== Xử lý kết quả ====\n", "if response.status_code == 200:\n", " data = response.json()\n", " measurements = data.get(\"measurements\", [])\n", " print(f\"✅ Đã lấy {len(measurements)} dữ liệu đo lường.\")\n", " if measurements:\n", " datapoints = measurements[0].get(\"dataPoints\", [])\n", " if datapoints:\n", " latest_point = [d for d in datapoints if d['value'] is not None][-1]\n", " value_bytes = latest_point['value']\n", " ts = latest_point['timestamp']\n", " print(f\"✅ Dung lượng hiện tại: {value_bytes / (1024**2):.2f} MB (timestamp: {ts})\")\n", " else:\n", " print(\"⚠️ Không có datapoint nào.\")\n", " else:\n", " print(\"⚠️ Không có dữ liệu đo lường.\")\n", "else:\n", " print(f\"❌ Lỗi {response.status_code}: {response.text}\")\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2025-06-05 15:19:25'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import time \n", "now = time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n", "now" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "ename": "OperationFailure", "evalue": "(Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }, full error: {'ok': 0, 'errmsg': '(Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }', 'code': 8000, 'codeName': 'AtlasError'}", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mOperationFailure\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[27], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtime\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madmin\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommand\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfsync\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Force flush\u001b[39;00m\n\u001b[0;32m 4\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m1\u001b[39m)\n", "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\_csot.py:119\u001b[0m, in \u001b[0;36mapply..csot_wrapper\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 117\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _TimeoutContext(timeout):\n\u001b[0;32m 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m--> 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\database.py:930\u001b[0m, in \u001b[0;36mDatabase.command\u001b[1;34m(self, command, value, check, allowable_errors, read_preference, codec_options, session, comment, **kwargs)\u001b[0m\n\u001b[0;32m 925\u001b[0m read_preference \u001b[38;5;241m=\u001b[39m (session \u001b[38;5;129;01mand\u001b[39;00m session\u001b[38;5;241m.\u001b[39m_txn_read_preference()) \u001b[38;5;129;01mor\u001b[39;00m ReadPreference\u001b[38;5;241m.\u001b[39mPRIMARY\n\u001b[0;32m 926\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39m_conn_for_reads(read_preference, session, operation\u001b[38;5;241m=\u001b[39mcommand_name) \u001b[38;5;28;01mas\u001b[39;00m (\n\u001b[0;32m 927\u001b[0m connection,\n\u001b[0;32m 928\u001b[0m read_preference,\n\u001b[0;32m 929\u001b[0m ):\n\u001b[1;32m--> 930\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_command(\n\u001b[0;32m 931\u001b[0m connection,\n\u001b[0;32m 932\u001b[0m command,\n\u001b[0;32m 933\u001b[0m value,\n\u001b[0;32m 934\u001b[0m check,\n\u001b[0;32m 935\u001b[0m allowable_errors,\n\u001b[0;32m 936\u001b[0m read_preference,\n\u001b[0;32m 937\u001b[0m opts, \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[0;32m 938\u001b[0m session\u001b[38;5;241m=\u001b[39msession,\n\u001b[0;32m 939\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 940\u001b[0m )\n", "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\database.py:770\u001b[0m, in \u001b[0;36mDatabase._command\u001b[1;34m(self, conn, command, value, check, allowable_errors, read_preference, codec_options, write_concern, parse_write_concern_error, session, **kwargs)\u001b[0m\n\u001b[0;32m 768\u001b[0m command\u001b[38;5;241m.\u001b[39mupdate(kwargs)\n\u001b[0;32m 769\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39m_tmp_session(session) \u001b[38;5;28;01mas\u001b[39;00m s:\n\u001b[1;32m--> 770\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommand\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 771\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 772\u001b[0m \u001b[43m \u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 773\u001b[0m \u001b[43m \u001b[49m\u001b[43mread_preference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 774\u001b[0m \u001b[43m \u001b[49m\u001b[43mcodec_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 775\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 776\u001b[0m \u001b[43m \u001b[49m\u001b[43mallowable_errors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 777\u001b[0m \u001b[43m \u001b[49m\u001b[43mwrite_concern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwrite_concern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 778\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 779\u001b[0m \u001b[43m \u001b[49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 780\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_client\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 781\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\helpers.py:47\u001b[0m, in \u001b[0;36m_handle_reauth..inner\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 44\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpymongo\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msynchronous\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpool\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Connection\n\u001b[0;32m 46\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 47\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 48\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m OperationFailure \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m 49\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m no_reauth:\n", "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\pool.py:536\u001b[0m, in \u001b[0;36mConnection.command\u001b[1;34m(self, dbname, spec, read_preference, codec_options, check, allowable_errors, read_concern, write_concern, parse_write_concern_error, collation, session, client, retryable_write, publish_events, user_fields, exhaust_allowed)\u001b[0m\n\u001b[0;32m 534\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_if_not_writable(unacknowledged)\n\u001b[0;32m 535\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 536\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcommand\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 537\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 538\u001b[0m \u001b[43m \u001b[49m\u001b[43mdbname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 539\u001b[0m \u001b[43m \u001b[49m\u001b[43mspec\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 540\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mis_mongos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 541\u001b[0m \u001b[43m \u001b[49m\u001b[43mread_preference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 542\u001b[0m \u001b[43m \u001b[49m\u001b[43mcodec_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 543\u001b[0m \u001b[43m \u001b[49m\u001b[43msession\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 544\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 545\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 546\u001b[0m \u001b[43m \u001b[49m\u001b[43mallowable_errors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 547\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maddress\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 548\u001b[0m \u001b[43m \u001b[49m\u001b[43mlisteners\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 549\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_bson_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 550\u001b[0m \u001b[43m \u001b[49m\u001b[43mread_concern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 551\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 552\u001b[0m \u001b[43m \u001b[49m\u001b[43mcollation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcollation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 553\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression_ctx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompression_context\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 554\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_op_msg\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mop_msg_enabled\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 555\u001b[0m \u001b[43m \u001b[49m\u001b[43munacknowledged\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43munacknowledged\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 556\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_fields\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_fields\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 557\u001b[0m \u001b[43m \u001b[49m\u001b[43mexhaust_allowed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexhaust_allowed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 558\u001b[0m \u001b[43m \u001b[49m\u001b[43mwrite_concern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwrite_concern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 559\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 560\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (OperationFailure, NotPrimaryError):\n\u001b[0;32m 561\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\synchronous\\network.py:213\u001b[0m, in \u001b[0;36mcommand\u001b[1;34m(conn, dbname, spec, is_mongos, read_preference, codec_options, session, client, check, allowable_errors, address, listeners, max_bson_size, read_concern, parse_write_concern_error, collation, compression_ctx, use_op_msg, unacknowledged, user_fields, exhaust_allowed, write_concern)\u001b[0m\n\u001b[0;32m 211\u001b[0m client\u001b[38;5;241m.\u001b[39m_process_response(response_doc, session)\n\u001b[0;32m 212\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check:\n\u001b[1;32m--> 213\u001b[0m \u001b[43mhelpers_shared\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_command_response\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[43m \u001b[49m\u001b[43mresponse_doc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 215\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_wire_version\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 216\u001b[0m \u001b[43m \u001b[49m\u001b[43mallowable_errors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 217\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_write_concern_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 218\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 219\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m 220\u001b[0m duration \u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mdatetime\u001b[38;5;241m.\u001b[39mnow() \u001b[38;5;241m-\u001b[39m start\n", "File \u001b[1;32md:\\Miniconda\\envs\\machine_translate\\lib\\site-packages\\pymongo\\helpers_shared.py:247\u001b[0m, in \u001b[0;36m_check_command_response\u001b[1;34m(response, max_wire_version, allowable_errors, parse_write_concern_error)\u001b[0m\n\u001b[0;32m 244\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m43\u001b[39m:\n\u001b[0;32m 245\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CursorNotFound(errmsg, code, response, max_wire_version)\n\u001b[1;32m--> 247\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m OperationFailure(errmsg, code, response, max_wire_version)\n", "\u001b[1;31mOperationFailure\u001b[0m: (Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }, full error: {'ok': 0, 'errmsg': '(Unauthorized) not authorized on admin to execute command { fsync: 1, lsid: { id: {4 [50 114 225 195 219 36 79 24 143 231 27 7 151 76 44 22]} }, $clusterTime: { clusterTime: {1749030237 5}, signature: { hash: {0 [221 223 81 8 201 103 38 61 210 190 116 79 255 71 28 215 34 19 176 79]}, keyId: 7456826922580836352.000000 } }, $db: \"admin\" }', 'code': 8000, 'codeName': 'AtlasError'}" ] } ], "source": [ "import time\n", "\n", "client.admin.command(\"fsync\") # Force flush\n", "time.sleep(1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def file_list(collection=\"root_file\"):\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[\"ppt\"]\n", " fs = gridfs.GridFS(db, collection=collection)\n", " for file in fs.find():\n", " print(f\"📂 File: {file.filename} - ID: {file._id}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcd8c575cfef63155d3f91\n", "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcea4f02257ad0cb04610e\n", "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcead0143da29a5c6321ab\n", "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dd3bf23cf7ee2f6eca902e\n" ] } ], "source": [ "file_list(\"root_file\")\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa file với ID: 67d383681d4db191e51b0bd8\n" ] } ], "source": [ "delete_pptx_from_mongodb(file_id='67d383681d4db191e51b0bd8')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from io import BytesIO\n", "from openpyxl import load_workbook\n", "from utils.utils import translate_single_text\n", "from pymongo import MongoClient\n", "from bson import ObjectId\n", "import gridfs\n", "from utils.utils import unzip_office_file\n", "\n", "\n", "# 1. Kết nối tới MongoDB\n", "client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", "db = client['excel']\n", "fs = gridfs.GridFS(db, collection='root_file')\n", "\n", "# 2. Tải file Excel từ MongoDB\n", "file_obj = fs.get(ObjectId('6836c69e59530f034bd86576'))\n", "excel_file = BytesIO(file_obj.read())\n", "\n", "xml_folder = unzip_office_file(r\"D:\\Show_me_everything\\Machine Translation\\input\\sample.xlsx\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "import xml.etree.ElementTree as ET\n", "from typing import List\n", "from utils.utils import translate_single_text\n", "\n", "def _register_all_namespaces(xml_file_path):\n", " try:\n", " namespaces = dict([\n", " node for _, node in ET.iterparse(xml_file_path, events=['start-ns'])\n", " ])\n", " for prefix, uri in namespaces.items():\n", " ET.register_namespace(prefix, uri)\n", " if '' in namespaces and namespaces[''] == \"http://schemas.openxmlformats.org/spreadsheetml/2006/main\":\n", " ET.register_namespace('', \"http://schemas.openxmlformats.org/spreadsheetml/2006/main\")\n", " if 'r' not in namespaces:\n", " ET.register_namespace('r', \"http://schemas.openxmlformats.org/officeDocument/2006/relationships\")\n", " if 'mc' not in namespaces:\n", " ET.register_namespace('mc', \"http://schemas.openxmlformats.org/markup-compatibility/2006\")\n", " if 'x15' not in namespaces:\n", " ET.register_namespace('x15', \"http://schemas.microsoft.com/office/spreadsheetml/2010/11/main\")\n", " return namespaces # Trả về để có thể dùng trong findall nếu cần\n", " except Exception as e:\n", " print(f\"Lỗi khi đăng ký namespace từ {xml_file_path}: {e}\")\n", " return {}\n", "\n", "def get_all_sheet_names_from_file(workbook_xml_path, source_lang='chinese', target_lang='vietnamese') -> List[str]:\n", " sheet_names = {}\n", " try:\n", " # Phân tích trực tiếp từ file\n", " tree = ET.parse(workbook_xml_path)\n", " root = tree.getroot()\n", "\n", " namespaces = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}\n", "\n", " for sheet_element in root.findall('.//main:sheet', namespaces):\n", " name = sheet_element.get('name')\n", " if name:\n", " sheet_names[name] = name\n", " for k in sheet_names:\n", " sheet_names[k] = translate_single_text(k, source_lang, target_lang)\n", "\n", " except FileNotFoundError:\n", " print(f\"Lỗi: Không tìm thấy file tại '{workbook_xml_path}'\")\n", " except ET.ParseError as e:\n", " print(f\"Lỗi khi phân tích XML từ file '{workbook_xml_path}': {e}\")\n", " except Exception as e:\n", " print(f\"Đã xảy ra lỗi không mong muốn khi xử lý file '{workbook_xml_path}': {e}\")\n", "\n", " return sheet_names\n", "\n", "def translate_sheet_names_in_file(workbook_xml_path, source_lang = 'chinese', target_lang = 'vietnamese'):\n", "\n", " original_to_translated_map: Dict[str, str] = {}\n", " file_modified = False\n", "\n", " try:\n", " parsed_namespaces = _register_all_namespaces(workbook_xml_path)\n", " tree = ET.parse(workbook_xml_path)\n", " root = tree.getroot()\n", " search_namespaces = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}\n", " if '' in parsed_namespaces and parsed_namespaces[''] == search_namespaces['main']:\n", " pass # search_namespaces đã ổn\n", "\n", "\n", " # Bước 3: Lặp qua các sheet, dịch và cập nhật tên\n", " for sheet_element in root.findall('.//main:sheet', search_namespaces):\n", " original_name = sheet_element.get('name')\n", " if original_name and original_name.strip(): # Chỉ dịch nếu tên có nội dung\n", " try:\n", " # Gọi hàm dịch của bạn\n", " translated_name = translate_single_text(original_name, source_lang, target_lang)\n", " os.wait(5)\n", "\n", " if translated_name and translated_name.strip() and translated_name != original_name:\n", " # Cập nhật thuộc tính 'name' của element trong cây XML\n", " sheet_element.set('name', translated_name)\n", " original_to_translated_map[original_name] = translated_name\n", " file_modified = True\n", " print(f\"Đã dịch sheet: '{original_name}' -> '{translated_name}'\")\n", " else:\n", " # Nếu dịch thất bại hoặc không thay đổi, giữ lại tên gốc trong map\n", " original_to_translated_map[original_name] = original_name\n", " if translated_name and translated_name != original_name :\n", " print(f\"Bản dịch cho '{original_name}' trống hoặc không hợp lệ, không cập nhật XML.\")\n", " elif not translated_name:\n", " print(f\"Dịch thất bại cho '{original_name}', không cập nhật XML.\")\n", "\n", "\n", " except Exception as e_translate:\n", " print(f\"Lỗi khi dịch tên sheet '{original_name}': {e_translate}\")\n", " original_to_translated_map[original_name] = original_name # Ghi nhận lỗi, giữ tên gốc\n", "\n", " # Bước 4: Nếu có thay đổi, ghi lại toàn bộ cây XML vào file\n", " if file_modified:\n", " # encoding='utf-8' và xml_declaration=True là quan trọng\n", " tree.write(workbook_xml_path, encoding='utf-8', xml_declaration=True)\n", " print(f\"Đã cập nhật thành công file: {workbook_xml_path}\")\n", " else:\n", " print(f\"Không có tên sheet nào được thay đổi trong file: {workbook_xml_path}\")\n", "\n", " except FileNotFoundError:\n", " print(f\"Lỗi: Không tìm thấy file tại '{workbook_xml_path}'\")\n", " except ET.ParseError as e:\n", " print(f\"Lỗi khi phân tích XML từ file '{workbook_xml_path}': {e}\")\n", " except Exception as e:\n", " print(f\"Đã xảy ra lỗi không mong muốn khi xử lý file '{workbook_xml_path}': {e}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "\n", " return original_to_translated_map" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Đã dịch sheet: '流程图' -> 'Lưu đồ'\n", "Đã dịch sheet: '分割光源板' -> 'Tấm dẫn sáng phân đoạn'\n", "Đã dịch sheet: '支架外观检验' -> 'Kiểm tra ngoại quan giá đỡ'\n", "Đã dịch sheet: '固定驱动' -> 'Ổ đĩa cố định'\n", "Đã dịch sheet: '固定接地线' -> 'Dây nối đất cố định'\n", "Đã dịch sheet: '整理电源线' -> 'Sắp xếp dây nguồn'\n", "Đã dịch sheet: '固定光源板' -> 'Bảng nguồn sáng cố định'\n", "Đã dịch sheet: '连接光源板' -> 'Kết nối bảng nguồn sáng'\n", "Đã dịch sheet: '焊接光源板' -> 'Hàn bảng nguồn sáng'\n", "Đã dịch sheet: '连接驱动' -> 'Kết nối trình điều khiển'\n", "Đã dịch sheet: '安装端头 ' -> 'Lắp đặt đầu cuối'\n", "Đã dịch sheet: '试 亮' -> 'Thử Sáng'\n", "Đã dịch sheet: '绝缘、接地 ' -> 'Cách điện, tiếp đất'\n", "Đã dịch sheet: '安装透光罩 (2)' -> 'Lắp chụp đèn (2)'\n", "Đã dịch sheet: '老练' -> 'Lão luyện'\n", "Đã dịch sheet: '二次试亮' -> 'Thử sáng lần hai'\n", "Đã cập nhật thành công file: D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml\n" ] }, { "data": { "text/plain": [ "{'流程图': 'Lưu đồ',\n", " '分割光源板': 'Tấm dẫn sáng phân đoạn',\n", " '支架外观检验': 'Kiểm tra ngoại quan giá đỡ',\n", " '固定驱动': 'Ổ đĩa cố định',\n", " '固定接地线': 'Dây nối đất cố định',\n", " '整理电源线': 'Sắp xếp dây nguồn',\n", " '固定光源板': 'Bảng nguồn sáng cố định',\n", " '连接光源板': 'Kết nối bảng nguồn sáng',\n", " '焊接光源板': 'Hàn bảng nguồn sáng',\n", " '连接驱动': 'Kết nối trình điều khiển',\n", " '安装端头 ': 'Lắp đặt đầu cuối',\n", " '试 亮': 'Thử Sáng',\n", " '绝缘、接地 ': 'Cách điện, tiếp đất',\n", " '安装透光罩 (2)': 'Lắp chụp đèn (2)',\n", " '老练': 'Lão luyện',\n", " '二次试亮': 'Thử sáng lần hai'}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "translate_sheet_names_in_file(r'D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'流程图': 'Lưu đồ',\n", " '分割光源板': 'Tấm dẫn sáng phân đoạn',\n", " '支架外观检验': 'Kiểm tra ngoại quan của giá đỡ',\n", " '固定驱动': 'Ổ đĩa cố định',\n", " '固定接地线': 'Dây nối đất cố định',\n", " '整理电源线': 'Sắp xếp dây nguồn',\n", " '固定光源板': 'Tấm nền nguồn sáng cố định',\n", " '连接光源板': 'Kết nối bảng nguồn sáng',\n", " '焊接光源板': 'Hàn bảng nguồn sáng',\n", " '连接驱动': 'Kết nối trình điều khiển',\n", " '安装端头 ': 'Lắp đặt đầu cuối',\n", " '试 亮': 'Thử Lượng',\n", " '绝缘、接地 ': 'Cách điện, tiếp đất',\n", " '安装透光罩 (2)': 'Lắp đặt chụp đèn trong suốt (2)',\n", " '老练': 'Lão luyện',\n", " '二次试亮': 'Thử sáng lần hai'}" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sheets_name_d" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "from typing import Dict, Callable\n", "# Giả sử utils.utils.translate_single_text đã được import và hoạt động đúng\n", "# từ file trước: from utils.utils import translate_single_text\n", "\n", "def translate_sheet_names_via_regex(\n", " workbook_xml_path: str,\n", " source_lang: str = 'chinese',\n", " target_lang: str = 'vietnamese'\n", ") -> (Dict[str, str], bool):\n", "\n", " original_to_translated_map: Dict[str, str] = {}\n", " modified_content: str = \"\"\n", " file_changed_flag: bool = False # Sử dụng tên biến rõ ràng hơn\n", "\n", " try:\n", " with open(workbook_xml_path, 'r', encoding='utf-8') as f:\n", " content = f.read()\n", "\n", " current_content = content\n", "\n", " def replace_name_callback(match_obj):\n", " nonlocal file_changed_flag # Để sửa đổi biến bên ngoài\n", " nonlocal original_to_translated_map\n", "\n", " attr_prefix = match_obj.group(1) # Ví dụ: ''\n", "\n", " original_name = original_name_xml_encoded # Tạm thời bỏ qua unescape/escape cho đơn giản ví dụ\n", "\n", " if not original_name.strip():\n", " return match_obj.group(0) # Trả về chuỗi gốc nếu tên rỗng\n", "\n", " translated_name = original_name # Mặc định giữ nguyên\n", " if original_name in original_to_translated_map and original_to_translated_map[original_name] != original_name:\n", " translated_name = original_to_translated_map[original_name]\n", " # Nếu đã dịch và có thay đổi, không cần gọi API dịch nữa\n", " if translated_name != original_name: # Cần kiểm tra lại vì map có thể lưu tên gốc nếu dịch lỗi\n", " print(f\"Regex: Sử dụng bản dịch đã có cho '{original_name}' -> '{translated_name}'\")\n", " file_changed_flag = True # Đảm bảo cờ được set nếu sử dụng bản dịch đã có mà khác gốc\n", " else:\n", " try:\n", " translated_name_raw = translate_single_text(original_name, source_lang, target_lang)\n", "\n", " if translated_name_raw and translated_name_raw.strip() and translated_name_raw != original_name:\n", " translated_name = translated_name_raw[:31]\n", " original_to_translated_map[original_name] = translated_name\n", " file_changed_flag = True\n", " print(f\"Regex: Đã dịch sheet: '{original_name}' -> '{translated_name}'\")\n", " else:\n", " original_to_translated_map[original_name] = original_name # Lưu tên gốc nếu dịch lỗi/không đổi\n", " # translated_name vẫn là original_name\n", " if translated_name_raw and translated_name_raw.strip() and translated_name_raw == original_name:\n", " print(f\"Bản dịch cho '{original_name}' giống hệt bản gốc, không thay đổi.\")\n", " elif not (translated_name_raw and translated_name_raw.strip()):\n", " print(f\"Bản dịch cho '{original_name}' trống hoặc không hợp lệ, giữ nguyên.\")\n", "\n", " except Exception as e_translate:\n", " print(f\"Lỗi khi gọi hàm dịch cho '{original_name}': {e_translate}\")\n", " original_to_translated_map[original_name] = original_name\n", "\n", " translated_name_xml_encoded = translated_name # Tạm thời bỏ qua escape\n", "\n", " return f\"{attr_prefix}{opening_quote}{translated_name_xml_encoded}{opening_quote}{attr_suffix}\"\n", "\n", " sheet_name_pattern = re.compile(\n", " r'(]*?\\sname=)([\"\\'])((?:(?!\\2).)*?)(\\2)([^>]*?>)'\n", " )\n", " modified_content = sheet_name_pattern.sub(replace_name_callback, current_content)\n", "\n", " if file_changed_flag:\n", " with open(workbook_xml_path, 'w', encoding='utf-8') as f:\n", " f.write(modified_content)\n", " print(f\"Regex: Đã cập nhật thành công file: {workbook_xml_path}\")\n", " return original_to_translated_map, True\n", " else:\n", " print(f\"Regex: Không có tên sheet nào được thay đổi trong file: {workbook_xml_path}\")\n", "\n", " except FileNotFoundError:\n", " print(f\"Lỗi: Không tìm thấy file tại '{workbook_xml_path}'\")\n", " except Exception as e:\n", " print(f\"Đã xảy ra lỗi không mong muốn khi xử lý file '{workbook_xml_path}' bằng regex: {e}\")\n", " import traceback\n", " traceback.print_exc()\n" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Regex: Đã dịch sheet: '流程图' -> 'Lưu đồ'\n", "Regex: Đã dịch sheet: '分割光源板' -> 'Tấm dẫn sáng phân đoạn'\n", "Regex: Đã dịch sheet: '支架外观检验' -> 'Kiểm tra ngoại quan giá đỡ'\n", "Regex: Đã dịch sheet: '固定驱动' -> 'Ổ đĩa cố định'\n", "Regex: Đã dịch sheet: '固定接地线' -> 'Dây tiếp đất cố định'\n", "Regex: Đã dịch sheet: '整理电源线' -> 'Sắp xếp dây nguồn'\n", "Regex: Đã dịch sheet: '固定光源板' -> 'Bảng nguồn sáng cố định'\n", "Regex: Đã dịch sheet: '连接光源板' -> 'Kết nối bảng đèn nền'\n", "Regex: Đã dịch sheet: '焊接光源板' -> 'Bảng nguồn sáng hàn'\n", "Regex: Đã dịch sheet: '连接驱动' -> 'Kết nối trình điều khiển'\n", "Regex: Đã dịch sheet: '安装端头 ' -> 'Lắp đặt đầu cuối'\n", "Regex: Đã dịch sheet: '试 亮' -> 'Thử Sáng'\n", "Regex: Đã dịch sheet: '绝缘、接地 ' -> 'Cách điện, tiếp đất'\n", "Regex: Đã dịch sheet: '安装透光罩 (2)' -> 'Lắp chụp đèn (2)'\n", "Regex: Đã dịch sheet: '老练' -> 'Lão luyện'\n", "Regex: Đã dịch sheet: '二次试亮' -> 'Thử sáng lần hai'\n", "Regex: Đã cập nhật thành công file: D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml\n" ] } ], "source": [ "translation_map_regex, success_regex = translate_sheet_names_via_regex(r'D:\\Show_me_everything\\MT deploy\\MT_deploy\\unzipped_office\\xl\\workbook.xml')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "二次试亮!$A$1:$L$39分割光源板!$A$1:$L$40固定光源板!$A$1:$L$40固定接地线!$A$1:$L$40固定驱动!$A$1:$L$40'安装端头 '!$A$1:$L$40整理电源线!$A$1:$L$40流程图!$A$1:$L$40焊接光源板!$A$1:$L$40'绝缘、接地 '!$A$1:$L$40老练!$A$1:$L$40'试 亮'!$A$1:$L$39连接光源板!$A$1:$L$40连接驱动!$A$1:$L$40\n", "\n", "二次试亮!$A$1:$L$39分割光源板!$A$1:$L$40固定光源板!$A$1:$L$40固定接地线!$A$1:$L$40固定驱动!$A$1:$L$40'安装端头 '!$A$1:$L$40整理电源线!$A$1:$L$40流程图!$A$1:$L$40焊接光源板!$A$1:$L$40'绝缘、接地 '!$A$1:$L$40老练!$A$1:$L$40'试 亮'!$A$1:$L$39连接光源板!$A$1:$L$40连接驱动!$A$1:$L$40\n" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "26" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "str = 'Kiểm tra ngoại quan giá đỡ'\n", "len(str)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import zipfile\n", "import copy\n", "import time\n", "import xml.etree.ElementTree as ET\n", "from typing import List, Dict, Any, Optional, Tuple\n", "from utils.utils import translate_text, unzip_office_file, preprocess_text, postprocess_text, translate_single_text\n", "from pymongo import MongoClient\n", "import gridfs\n", "from io import BytesIO\n", "import shutil\n", "import io\n", "import re\n", "from typing import Dict\n", "\n", "\n", "NS_MAIN = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}\n", "NS_DRAWING = {'xdr': \"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing\"}\n", "NS_A = {'a': \"http://schemas.openxmlformats.org/drawingml/2006/main\"}\n", "\n", "# --- Hàm đăng ký namespace (quan trọng khi ghi file) ---\n", "def register_namespaces(xml_file):\n", " \"\"\"Đọc và đăng ký các namespace từ file XML.\"\"\"\n", " namespaces = dict([\n", " node for _, node in ET.iterparse(xml_file, events=['start-ns'])\n", " ])\n", " for ns, uri in namespaces.items():\n", " ET.register_namespace(ns, uri)\n", "\n", " # Đăng ký thêm namespace phổ biến nếu chưa có\n", " if 'main' not in namespaces and '' not in namespaces and NS_MAIN['main'] not in namespaces.values():\n", " ET.register_namespace('', NS_MAIN['main'])\n", " elif 'main' not in namespaces and NS_MAIN['main'] not in namespaces.values():\n", " ET.register_namespace('main', NS_MAIN['main'])\n", "\n", " # Đăng ký namespaces cho drawing nếu cần\n", " if 'xdr' not in namespaces and NS_DRAWING['xdr'] not in namespaces.values():\n", " ET.register_namespace('xdr', NS_DRAWING['xdr'])\n", " if 'a' not in namespaces and NS_A['a'] not in namespaces.values():\n", " ET.register_namespace('a', NS_A['a'])\n", "\n", "\n", "def extract_text_from_sheet(unzipped_folder_path: str) -> Optional[Tuple[List[Dict[str, Any]], Dict[str, Any]]]:\n", " \"\"\"\n", " Trích xuất text, lưu lại định dạng của run đầu tiên nếu là Rich Text,\n", " bao gồm cả text từ TextBoxes trong drawings.\n", " \"\"\"\n", " modifiable_nodes = []\n", " shared_strings_path = os.path.join(unzipped_folder_path, \"xl\", \"sharedStrings.xml\")\n", " worksheets_folder = os.path.join(unzipped_folder_path, \"xl\", \"worksheets\")\n", " drawings_folder = os.path.join(unzipped_folder_path, \"xl\", \"drawings\") # Thêm dòng này\n", "\n", " shared_tree = None\n", " sheet_trees = {}\n", " drawing_trees = {} # Thêm dòng này\n", "\n", " # --- Xử lý sharedStrings.xml ---\n", " if os.path.exists(shared_strings_path):\n", " try:\n", " register_namespaces(shared_strings_path) # Đảm bảo register_namespaces được gọi\n", " shared_tree = ET.parse(shared_strings_path)\n", " root_shared = shared_tree.getroot()\n", "\n", " for si_element in root_shared.findall('main:si', NS_MAIN):\n", " text_parts = []\n", " # Tìm tất cả con, bất kể chúng nằm trong hay không\n", " t_elements = si_element.findall('.//main:t', NS_MAIN)\n", "\n", " first_r = si_element.find('./main:r', NS_MAIN)\n", " first_rpr_clone = None\n", " is_rich_text = first_r is not None # Rich text nếu có ít nhất một \n", "\n", " if is_rich_text:\n", " # Cố gắng tìm bên trong đầu tiên\n", " first_rpr_candidate = si_element.find('./main:r/main:rPr', NS_MAIN)\n", " if first_rpr_candidate is not None:\n", " first_rpr_clone = copy.deepcopy(first_rpr_candidate)\n", " else:\n", " # Nếu đầu tiên không có , kiểm tra (Phonetic properties, ít gặp hơn)\n", " # Hoặc có thể không có định dạng nào cụ thể ở run đầu\n", " pass\n", "\n", "\n", " for t_node in t_elements:\n", " if t_node.text:\n", " text_parts.append(t_node.text)\n", " full_text = \"\".join(text_parts)\n", "\n", " if not full_text or full_text.isspace(): continue\n", "\n", " # Logic xác định type dựa trên sự hiện diện của đã được điều chỉnh\n", " if is_rich_text : # Chỉ cần có là đủ, first_rpr_clone có thể là None\n", " modifiable_nodes.append({\n", " 'type': 'shared_rich',\n", " 'original_text': full_text,\n", " 'element': si_element,\n", " 'first_format': first_rpr_clone, # Sẽ là None nếu đầu không có \n", " 'source_file': os.path.join(\"xl\", \"sharedStrings.xml\"),\n", " 'sheet_name': None\n", " })\n", " elif t_elements:\n", " direct_t = si_element.find('./main:t', NS_MAIN)\n", " if direct_t is not None:\n", " modifiable_nodes.append({\n", " 'type': 'shared_simple',\n", " 'original_text': full_text,\n", " 'element': direct_t, # Tham chiếu \n", " 'first_format': None,\n", " 'source_file': os.path.join(\"xl\", \"sharedStrings.xml\"),\n", " 'sheet_name': None\n", " })\n", " # else: ít khả năng xảy ra nếu t_elements có phần tử\n", "\n", " except Exception as e:\n", " print(f\"Lỗi xử lý sharedStrings: {e}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "\n", " # --- Xử lý các file sheetX.xml (Inline Strings) ---\n", " if os.path.isdir(worksheets_folder):\n", " for sheet_filename in sorted(os.listdir(worksheets_folder)):\n", " if sheet_filename.lower().endswith(\".xml\"):\n", " sheet_file_path = os.path.join(worksheets_folder, sheet_filename)\n", " try:\n", " register_namespaces(sheet_file_path) # Đảm bảo register_namespaces được gọi\n", " sheet_tree = ET.parse(sheet_file_path)\n", " sheet_trees[sheet_filename] = sheet_tree\n", " root_sheet = sheet_tree.getroot()\n", " for cell in root_sheet.findall('.//main:c[@t=\"inlineStr\"]', NS_MAIN):\n", " t_element = cell.find('.//main:is/main:t', NS_MAIN) # Sửa lại tìm kiếm \n", " if t_element is not None and t_element.text is not None and t_element.text.strip():\n", " modifiable_nodes.append({\n", " 'type': 'inline',\n", " 'original_text': t_element.text,\n", " 'element': t_element,\n", " 'first_format': None,\n", " 'source_file': os.path.join(\"xl\", \"worksheets\", sheet_filename),\n", " 'sheet_name': sheet_filename\n", " })\n", " except Exception as e:\n", " print(f\"Lỗi xử lý sheet {sheet_filename}: {e}\")\n", " import traceback\n", " traceback.print_exc()\n", " else:\n", " print(f\"Cảnh báo: Không tìm thấy thư mục worksheets: {worksheets_folder}\")\n", "\n", "\n", " # --- Xử lý các file drawingX.xml (Text Boxes, Shapes with Text) ---\n", " if os.path.isdir(drawings_folder):\n", " for drawing_filename in sorted(os.listdir(drawings_folder)):\n", " if drawing_filename.lower().endswith(\".xml\"):\n", " drawing_file_path = os.path.join(drawings_folder, drawing_filename)\n", " try:\n", " register_namespaces(drawing_file_path) # Đảm bảo register_namespaces được gọi\n", " drawing_tree = ET.parse(drawing_file_path)\n", " drawing_trees[drawing_filename] = drawing_tree\n", " root_drawing = drawing_tree.getroot()\n", "\n", " # TextBoxes và Shapes có text thường nằm trong (shape) -> (text body)\n", " # Bên trong là các (paragraph)\n", " for p_element in root_drawing.findall('.//xdr:txBody/a:p', {**NS_DRAWING, **NS_A}):\n", " text_parts = []\n", " # Lấy text từ tất cả trong paragraph này\n", " t_elements = p_element.findall('.//a:t', NS_A)\n", "\n", " first_r = p_element.find('./a:r', NS_A) # Tìm con trực tiếp đầu tiên của \n", " first_rpr_clone = None # Định dạng của run đầu tiên trong paragraph\n", "\n", " is_rich_text_paragraph = first_r is not None # Coi là rich nếu có \n", "\n", " if is_rich_text_paragraph:\n", " # Tìm bên trong đầu tiên của \n", " first_rpr = first_r.find('./a:rPr', NS_A)\n", " if first_rpr is not None:\n", " first_rpr_clone = copy.deepcopy(first_rpr)\n", "\n", " for t_node in t_elements:\n", " if t_node.text:\n", " text_parts.append(t_node.text)\n", " full_text = \"\".join(text_parts)\n", "\n", " if not full_text or full_text.isspace(): continue\n", "\n", " # Lưu node là vì chúng ta sẽ thay thế toàn bộ nội dung của nó\n", " # (các bên trong)\n", " modifiable_nodes.append({\n", " 'type': 'drawing_text', # Loại mới cho text trong drawing\n", " 'original_text': full_text,\n", " 'element': p_element, # Tham chiếu đến \n", " 'first_format': first_rpr_clone, # Lưu định dạng của đầu tiên (hoặc None)\n", " 'source_file': os.path.join(\"xl\", \"drawings\", drawing_filename),\n", " 'sheet_name': None # Có thể tìm cách liên kết ngược lại sheet nếu cần\n", " })\n", " except Exception as e:\n", " print(f\"Lỗi xử lý drawing {drawing_filename}: {e}\")\n", " import traceback\n", " traceback.print_exc()\n", " else:\n", " print(f\"Thông tin: Không tìm thấy thư mục drawings: {drawings_folder}\")\n", "\n", "\n", " global_data = {\n", " \"shared_tree\": shared_tree,\n", " \"sheet_trees\": sheet_trees,\n", " \"drawing_trees\": drawing_trees, # Thêm dòng này\n", " \"shared_strings_path\": shared_strings_path,\n", " \"worksheets_folder\": worksheets_folder,\n", " \"drawings_folder\": drawings_folder # Thêm dòng này\n", " }\n", " return modifiable_nodes, global_data\\\n", "\n", "\n", "\n", "def zip_folder_to_excel_file(folder_path, file_name):\n", " try:\n", " # Nén thư mục thành file .xlsx trong RAM\n", " xlsx_buffer = io.BytesIO()\n", " with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:\n", " for root, _, files in os.walk(folder_path):\n", " for file in files:\n", " file_path = os.path.join(root, file)\n", " archive_path = os.path.relpath(file_path, folder_path)\n", " zipf.write(file_path, archive_path)\n", "\n", " xlsx_buffer.seek(0)\n", "\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client['excel'] \n", " fs = gridfs.GridFS(db, collection='final_file')\n", "\n", " file_id = fs.put(xlsx_buffer.read(), filename=file_name)\n", " print(f\"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}\")\n", " return file_id\n", "\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}\")\n", " return None\n", " \n", "\n", "\n", "def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):\n", " \n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client['excel']\n", " fs = gridfs.GridFS(db, collection='root_file')\n", " \n", " ppt_file = fs.get(file_id)\n", " excel_file = BytesIO(ppt_file.read())\n", "\n", " xml_folder = unzip_office_file(excel_file)\n", " path_to_workbook_xml = os.path.join(xml_folder, \"xl\", \"workbook.xml\")\n", " translate_sheet_names_via_regex(path_to_workbook_xml, source_lang, target_lang)\n", "\n", " modifiable_nodes, global_data = extract_text_from_sheet(xml_folder)\n", "\n", " original_texts = get_text_list_from_nodes(modifiable_nodes)\n", "\n", " all_results = [None] * len(original_texts)\n", " current_index = 0\n", " processed_count = 0\n", " api_call_counter = 0 # Track API calls for delay logic\n", "\n", " while current_index < len(original_texts):\n", " batch_texts_to_translate = []\n", " batch_original_indices = [] # 0-based indices for assignment\n", " batch_end_index = min(current_index + batch_size_segments, len(original_texts))\n", " found_long_segment_at = -1 # 0-based index in original_texts\n", "\n", " # 1. Build the next potential batch, stopping if a long segment is found\n", " for i in range(current_index, batch_end_index):\n", " segment = original_texts[i]\n", " word_count = count_words(segment)\n", "\n", " if word_count <= max_words_per_segment:\n", " batch_texts_to_translate.append(segment)\n", " batch_original_indices.append(i)\n", " else:\n", " found_long_segment_at = i\n", " break # Stop building this batch\n", "\n", " # --- Process the findings ---\n", "\n", " # 2. Translate the VALID batch collected *before* the long segment (if any)\n", " if batch_texts_to_translate:\n", " # Add delay BEFORE the API call if it's not the very first call\n", " if api_call_counter > 0 and delay_between_requests > 0:\n", " time.sleep(delay_between_requests)\n", "\n", " translated_batch = _translate_batch_helper(\n", " batch_texts_to_translate,\n", " [idx + 1 for idx in batch_original_indices], # 1-based for logging\n", " source_lang,\n", " target_lang\n", " )\n", " api_call_counter += 1\n", " # Assign results back\n", " for batch_idx, original_idx in enumerate(batch_original_indices):\n", " all_results[original_idx] = translated_batch[batch_idx]\n", " processed_count += len(batch_texts_to_translate)\n", "\n", " # 3. Handle the long segment INDIVIDUALLY (if one was found)\n", " if found_long_segment_at != -1:\n", " long_segment_index = found_long_segment_at\n", " long_segment_text = str(original_texts[long_segment_index])\n", " # word_count = count_words(long_segment_text) # Recalculate for log clarity\n", "\n", " try:\n", " translated = translate_single_text(long_segment_text, source_lang, target_lang)\n", " \n", " final = [translated]\n", " api_call_counter += 1\n", "\n", " if len(final) == 1:\n", " all_results[long_segment_index] = final[0]\n", " else:\n", " print(f\" *** CRITICAL ERROR: Long segment translation result count mismatch! Expected 1, got {len(final)}. Marking as failed.\")\n", " all_results[long_segment_index] = \"\"\n", "\n", " except Exception as e:\n", " print(f\" *** ERROR during translation of long segment {long_segment_index + 1}: {e}. Marking as failed.\")\n", " # traceback.print_exc() # Uncomment for detailed debug\n", " all_results[long_segment_index] = \"\"\n", " # Do not increment api_call_counter if the API call itself failed before returning\n", "\n", " processed_count += 1\n", " # Update current_index to start AFTER this long segment\n", " current_index = long_segment_index + 1\n", "\n", " else:\n", " # No long segment was found in the range checked.\n", " # Move current_index to the end of the range examined.\n", " current_index = batch_end_index\n", "\n", " missing_count = 0\n", " final_texts_for_nodes = []\n", " for i, res in enumerate(all_results):\n", " if res is None:\n", " print(f\"LỖI LOGIC: Segment {i+1} không được xử lý! Giữ lại text gốc: '{original_texts[i]}'\")\n", " final_texts_for_nodes.append(original_texts[i])\n", " missing_count += 1\n", " else:\n", " final_texts_for_nodes.append(res)\n", "\n", " if missing_count > 0:\n", " print(f\"CẢNH BÁO NGHIÊM TRỌNG: {missing_count} segments bị bỏ lỡ trong quá trình xử lý.\")\n", "\n", " if len(final_texts_for_nodes) != len(original_texts):\n", " print(f\"LỖI NGHIÊM TRỌNG: Số lượng text cuối cùng ({len(final_texts_for_nodes)}) không khớp với gốc ({len(original_texts)}). Hủy bỏ cập nhật.\")\n", " else:\n", " # Gán vào node\n", " for i, node_info in enumerate(modifiable_nodes):\n", " node_info['modified_text'] = final_texts_for_nodes[i]\n", " \n", " save_success = apply_and_save_changes(modifiable_nodes, global_data)\n", " if not save_success:\n", " print(\"LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.\")\n", " else:\n", " # Only zip if saving XML was successful\n", " final_id = zip_folder_to_excel_file(xml_folder, file_name)\n", " if final_id:\n", " shutil.rmtree(xml_folder) # Mark folder as 'handled' by zipping\n", " else:\n", " print(\"LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.\")\n", " return final_id" ] } ], "metadata": { "kernelspec": { "display_name": "machine_translate", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }