Spaces:
Running
Running
File size: 6,146 Bytes
0e9ff78 b14389a 0e9ff78 6ae64ab 0e9ff78 182876f 6ae64ab 0e9ff78 fad6c52 0e9ff78 182876f 217a617 182876f b14389a 182876f 1b6f99d 182876f 0e9ff78 6ae64ab 0e9ff78 6ae64ab 0e9ff78 d586fe1 0e9ff78 d586fe1 0e9ff78 d586fe1 182876f 217a617 182876f 217a617 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
from pymongo import MongoClient
import gridfs
from bson import ObjectId
import os
from io import BytesIO
import magic
from datetime import datetime, timedelta
def connect_mongodb(db_name, collection_name):
    """
    Open a GridFS handle on the requested database and bucket.

    :param db_name: Name of the MongoDB database
    :param collection_name: Name of the GridFS collection (bucket prefix)
    :return: ``gridfs.GridFS`` instance bound to that database and bucket
    """
    mongo_uri = "mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
    # The client is deliberately not closed here: the GridFS handle returned
    # to the caller is built on top of it.
    database = MongoClient(mongo_uri)[db_name]
    return gridfs.GridFS(database, collection=collection_name)
def save_file_to_mongodb(uploaded_file, db_name="pptx", collection_name="root_file", max_db_size_mb=500):
    """
    Store an uploaded file in GridFS, purging stored files first when the
    measured data size exceeds the limit.

    :param uploaded_file: File-like object exposing ``name``, ``seek`` and ``read``
    :param db_name: Name of the MongoDB database
    :param collection_name: Name of the GridFS collection
    :param max_db_size_mb: Size threshold in MB above which ``delete_all`` is called
    :return: ``(file_id, file_name)`` on success, ``(None, None)`` on any error
    """
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    bucket = gridfs.GridFS(client[db_name], collection=collection_name)

    file_name = uploaded_file.name
    # Rewind so the whole file is read even if it was consumed earlier.
    uploaded_file.seek(0)
    payload = uploaded_file.read()

    try:
        # Current amount of stored data, already converted from bytes to MB.
        db_size_mb = get_total_cluster_size(client)
        print(f"📦 Database size: {db_size_mb:.2f} MB")

        over_limit = db_size_mb > max_db_size_mb
        if over_limit:
            delete_all()

        # After the optional clean-up, store the new file.
        file_id = bucket.put(payload, filename=file_name)

        # Log timestamp is the local clock shifted forward by five hours.
        formatted_now = (datetime.now() + timedelta(hours=5)).strftime("%Y-%m-%d %H:%M:%S")
        print(f"{formatted_now} File '{file_name}' đã được lưu vào '{collection_name}' với ID: {file_id} \n")
        return file_id, file_name
    except Exception as e:
        print(f"❌ Lỗi khi lưu file hoặc truy vấn MongoDB: {e} \n")
        return None, None
    finally:
        client.close()
def save_xml_to_gridfs(xml_content, file_name, db_name="ppt", collection_name="original_xml"):
    """
    Save an XML string to MongoDB GridFS unless a file with that name exists.

    :param xml_content: XML string to store (encoded as UTF-8 before saving)
    :param file_name: Name of the XML file
    :param db_name: Name of the MongoDB database
    :param collection_name: Name of the GridFS collection
    :return: None in every case; the outcome is only reported via ``print``
    """
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
    try:
        db = client[db_name]
        fs = gridfs.GridFS(db, collection=collection_name)

        # Skip the upload when a file with the same name is already stored.
        existing_file = fs.find_one({"filename": file_name})
        if existing_file:
            print(f"⚠️ File '{file_name}' đã tồn tại trong GridFS. Không lưu lại.")
            return

        # Convert the XML string to bytes and store it in GridFS.
        file_id = fs.put(xml_content.encode("utf-8"), filename=file_name)
        print(f"✅ XML '{file_name}' đã được lưu vào GridFS với ID: {file_id}")
    finally:
        # Release the connection on both the early-return and the upload path.
        client.close()
def fetch_file_from_mongodb(db_name, collection_name, file_id):
    """
    Load a file from GridFS into an in-memory buffer.

    :param db_name: Name of the MongoDB database
    :param collection_name: Name of the GridFS collection
    :param file_id: ``_id`` of the stored file, passed unchanged to ``GridFS.get``
    :return: ``(BytesIO positioned at offset 0, filename)`` on success,
             ``(None, None)`` when the lookup or read fails
    """
    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")  # Update if needed
    try:
        db = client[db_name]
        fs = gridfs.GridFS(db, collection_name)
        file_data = fs.get(file_id)
        pptx_io = BytesIO(file_data.read())
        pptx_io.seek(0)  # Rewind to the start of the buffer
        return pptx_io, file_data.filename
    except Exception as e:
        print(f"Lỗi khi lấy file từ MongoDB: {e}")
        return None, None
    finally:
        # The payload has been fully copied into memory by now, so the
        # connection can be released safely.
        client.close()
def detect_file_type(uploaded_file):
    """
    Classify an uploaded file as CSV, Word, Excel, PPTX or PDF.

    The file extension is checked first; only when it is not recognised is
    the content sniffed with libmagic. Sniffing always looks at the first
    4096 bytes of the stream and then puts the stream back where it was, so
    callers can keep reading the file afterwards.

    :param uploaded_file: File-like object exposing ``name``, ``tell``,
                          ``seek`` and ``read``, or None
    :return: One of "CSV", "Word", "Excel", "PPTX", "PDF"; "Unknown" when the
             type is not recognised or detection fails; None when
             ``uploaded_file`` is None
    """
    if uploaded_file is None:
        return None

    try:
        # Prefer the file extension.
        ext = os.path.splitext(uploaded_file.name)[1].lower()
        ext_mapping = {
            ".csv": "CSV", ".docx": "Word", ".doc": "Word",
            ".xlsx": "Excel", ".pptx": "PPTX", ".pdf": "PDF"
        }
        detected_type = ext_mapping.get(ext)
        if detected_type:
            return detected_type  # Known extension: answer immediately

        # No recognised extension: fall back to the MIME type of the content.
        # Magic numbers live at the start of the file, so sniff from offset 0
        # and restore the caller's position afterwards.
        start_pos = uploaded_file.tell()
        try:
            uploaded_file.seek(0)
            file_bytes = uploaded_file.read(4096)
        finally:
            uploaded_file.seek(start_pos)

        mime = magic.Magic(mime=True)
        file_type = mime.from_buffer(file_bytes)
        mime_types = {
            "application/pdf": "PDF",
            "application/vnd.ms-powerpoint": "PPTX",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel",
            "application/vnd.ms-excel": "Excel",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word",
            "application/msword": "Word",
            "text/csv": "CSV",
            "text/plain": "CSV"
        }
        return mime_types.get(file_type, "Unknown")
    except Exception as e:
        print(f"Error detecting file type: {e}")
        return "Unknown"
def delete_all_files_in_collection(collection_name, db_name="ppt"):
    """
    Delete every file stored in one GridFS collection.

    Errors are reported with ``print`` and never propagated to the caller.

    :param collection_name: Name of the GridFS collection to empty
    :param db_name: Name of the MongoDB database
    :return: None
    """
    # Bound before the try block so the finally clause can tell whether a
    # connection was actually opened.
    client = None
    try:
        client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
        db = client[db_name]
        fs = gridfs.GridFS(db, collection=collection_name)

        # Collect the ids up front, then delete through GridFS.
        file_ids = [file["_id"] for file in db[f"{collection_name}.files"].find({})]
        for file_id in file_ids:
            fs.delete(file_id)

        print(f"✅ Đã xóa {len(file_ids)} file trong collection '{collection_name}' của db '{db_name}'")
    except Exception as e:
        print(f"❌ Lỗi khi xóa file: {str(e)}")
    finally:
        if client is not None:
            client.close()
def delete_all():
    """
    Empty the 'root_file' and 'final_file' GridFS collections in each of the
    databases 'word', 'exce', 'pptx' and 'csv'.
    """
    buckets = ('root_file', 'final_file')
    # NOTE(review): 'exce' may be a typo for 'excel' - confirm against the
    # real database names before changing it.
    databases = ('word', 'exce', 'pptx', 'csv')

    # Same visiting order as a nested loop: all databases for the first
    # collection, then all databases for the second.
    targets = [(bucket, database) for bucket in buckets for database in databases]
    for bucket, database in targets:
        delete_all_files_in_collection(bucket, db_name=database)
def get_total_cluster_size(client, db_names=('word', 'exce', 'pptx', 'csv')):
    """
    Sum the ``dataSize`` reported by ``dbstats`` over several databases.

    :param client: Object indexable by database name whose items expose
                   ``command("dbstats")`` (a ``MongoClient`` in this file)
    :param db_names: Names of the databases to include; defaults to the
                     four databases this module works with
    :return: Total data size in MB, or -1 if any database could not be queried
    """
    try:
        # A database whose stats lack "dataSize" contributes 0 bytes.
        total_size = sum(
            client[db_name].command("dbstats").get("dataSize", 0)
            for db_name in db_names
        )
    except Exception as e:
        print(f"❌ Lỗi khi tính dung lượng cluster: {e}")
        return -1
    return total_size / (1024 ** 2)