{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pymongo import MongoClient\n", "import gridfs\n", "from bson import ObjectId\n", "import os\n", "from pptx import Presentation\n", "from concurrent.futures import ThreadPoolExecutor\n", "from xml.dom import minidom\n", "import xml.etree.ElementTree as ET\n", "# from pptx_object import get_table_properties, get_shape_properties\n", "from pptx.enum.shapes import MSO_SHAPE_TYPE\n", "from typing import Dict, List, Optional\n", "import json\n", "# from translator import translate_text_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Delete file in DB" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def delete_all_files_in_collection(collection_name, db_name=\"ppt\"):\n", " \"\"\"\n", " Xóa toàn bộ file trong GridFS của MongoDB.\n", "\n", " :param collection_name: Tên collection chứa file cần xoá (ví dụ: 'original_pptx', 'original_xml', 'final_xml')\n", " :param db_name: Tên database MongoDB (mặc định: 'ppt')\n", " \"\"\"\n", " try:\n", " # Kết nối MongoDB\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", "\n", " # Khởi tạo GridFS với collection được chỉ định\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " # Lấy danh sách file_id của tất cả file trong GridFS\n", " file_ids = [file[\"_id\"] for file in db[f\"{collection_name}.files\"].find({})]\n", "\n", " # Xóa từng file trong GridFS\n", " for file_id in file_ids:\n", " fs.delete(file_id)\n", "\n", " print(f\"✅ Đã xóa {len(file_ids)} file trong collection '{collection_name}'\")\n", "\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi xóa file: {str(e)}\")\n", "\n", " finally:\n", " client.close()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa 3 file trong collection 'final_pptx'\n" ] } ], "source": [ "delete_all_files_in_collection(\"final_pptx\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def delete_pptx_from_mongodb(file_id, db_name=\"ppt\", collection_name=\"_xml\"):\n", " \"\"\"\n", " Xóa file PowerPoint khỏi MongoDB theo ID.\n", "\n", " :param file_id: ID của file cần xóa (chuỗi hoặc ObjectId)\n", " :param db_name: Tên database trong MongoDB\n", " :param collection_name: Tên collection GridFS\n", " \"\"\"\n", " # Kết nối đến MongoDB\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " try:\n", " # Chuyển đổi ID nếu cần\n", " if not isinstance(file_id, ObjectId):\n", " file_id = ObjectId(file_id)\n", "\n", " # Kiểm tra file có tồn tại không\n", " if fs.exists(file_id):\n", " fs.delete(file_id)\n", " print(f\"✅ Đã xóa file với ID: {file_id}\")\n", " else:\n", " print(f\"⚠️ Không tìm thấy file với ID: {file_id}\")\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi xóa file: {e}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Download file from DB" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def download_input_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n", " os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\input\", exist_ok=True)\n", "\n", " full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\input\", save_name)\n", "\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " try:\n", " if not isinstance(file_id, ObjectId):\n", " file_id = ObjectId(file_id)\n", "\n", " file_data = fs.get(file_id)\n", " \n", " with open(full_file_path, \"wb\") as f:\n", " f.write(file_data.read())\n", "\n", " print(f\"✅ File đã được tải về: {full_file_path}\")\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi tải file: {e}\")\n", " finally:\n", " client.close()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def download_output_from_mongodb(file_id, save_name, db_name=\"ppt\", collection_name=\"root_file\"):\n", " os.makedirs(\"D:\\Show_me_everything\\Machine Translation\\output\", exist_ok=True)\n", "\n", " full_file_path = os.path.join(\"D:\\Show_me_everything\\Machine Translation\\output\", save_name)\n", "\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[db_name]\n", " fs = gridfs.GridFS(db, collection=collection_name)\n", "\n", " try:\n", " if not isinstance(file_id, ObjectId):\n", " file_id = ObjectId(file_id)\n", "\n", " file_data = fs.get(file_id)\n", " \n", " with open(full_file_path, \"wb\") as f:\n", " f.write(file_data.read())\n", "\n", " print(f\"✅ File đã được tải về: {full_file_path}\")\n", " except Exception as e:\n", " print(f\"❌ Lỗi khi tải file: {e}\")\n", " finally:\n", " client.close()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\input\\syllabus_big_data_analytics.docx\n" ] } ], "source": [ "download_input_from_mongodb(file_id=\"67f725830f818c0a963a9421\", save_name=\"syllabus_big_data_analytics.docx\", db_name=\"word\", collection_name=\"root_file\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ File đã được tải về: D:\\Show_me_everything\\Machine Translation\\output\\syllabus_big_data_analytics.docx\n" ] } ], "source": [ "download_output_from_mongodb(file_id=\"67f725a10f818c0a963a9425\", save_name=\"syllabus_big_data_analytics.docx\", db_name=\"word\", collection_name=\"final_file\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# List all file " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa 4 file trong collection 'root_file'\n", "✅ Đã xóa 0 file trong collection 'final_pptx'\n", "✅ Đã xóa 1 file trong collection 'original_xml'\n", "✅ Đã xóa 1 file trong collection 'final_xml'\n" ] } ], "source": [ "for i in ['root_file', 'final_pptx', 'original_xml', 'final_xml']:\n", "\n", " delete_all_files_in_collection(i, db_name = 'pptx')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa 3 file trong collection 'root_file'\n", "✅ Đã xóa 2 file trong collection 'final_file'\n" ] } ], "source": [ "for i in ['root_file', 'final_file']:\n", " delete_all_files_in_collection(i, db_name=\"word\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def file_list(collection=\"root_file\"):\n", " client = MongoClient(\"mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0\")\n", " db = client[\"ppt\"]\n", " fs = gridfs.GridFS(db, collection=collection)\n", " for file in fs.find():\n", " print(f\"📂 File: {file.filename} - ID: {file._id}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcd8c575cfef63155d3f91\n", "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcea4f02257ad0cb04610e\n", "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dcead0143da29a5c6321ab\n", "📂 File: Bản sao của Bản theo concept Hồ sơ tài trợ.pptx - ID: 67dd3bf23cf7ee2f6eca902e\n" ] } ], "source": [ "file_list(\"root_file\")\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Đã xóa file với ID: 67d383681d4db191e51b0bd8\n" ] } ], "source": [ "delete_pptx_from_mongodb(file_id='67d383681d4db191e51b0bd8')" ] } ], "metadata": { "kernelspec": { "display_name": "machine_translate", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }