Add application file
- .gitignore +207 -0
- app.py +274 -0
- config/database_configs.py +0 -0
- config/model_configs.py +17 -0
- config/settings.py +36 -0
- core/data_processing/audio_processor.py +80 -0
- core/data_processing/image_processor.py +89 -0
- core/data_processing/text_processor.py +64 -0
- core/data_processing/video_processor.py +118 -0
- core/embeddings/audio_embedding_model.py +86 -0
- core/embeddings/image_embedding_model.py +80 -0
- core/embeddings/text_embedding_model.py +42 -0
- core/retrieval/retriever.py +156 -0
- core/retrieval/vector_db_manager.py +169 -0
- requirements.txt +125 -0
- scripts/ingest_data.py +203 -0
- scripts/ingestion.py +227 -0
- utils/logger.py +24 -0
.gitignore
ADDED
@@ -0,0 +1,207 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/

# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
app.py
ADDED
@@ -0,0 +1,274 @@
# app.py
import gradio as gr
import os
import sys
import shutil
import zipfile
from typing import List, Dict, Any
from pathlib import Path

# Add the project root to the Python path so the project modules can be imported
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from utils.logger import logger
from config.settings import settings
from qdrant_client import QdrantClient
from core.retrieval.retriever import Retriever
from scripts.ingestion import IngestionService

# --- 1. Initialize the global services ---
logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
try:
    # Create a SINGLE QdrantClient and share it between the services
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    shared_qdrant_client = QdrantClient(path=qdrant_db_path)
    logger.info("Shared Qdrant client initialized.")

    # Initialize the services with the shared client
    ingestion_service = IngestionService(client=shared_qdrant_client)
    retriever_instance = Retriever(client=shared_qdrant_client)

    logger.info("All services initialized successfully.")
except Exception as e:
    logger.error(f"Failed to initialize global services: {e}")
    raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")


# ---- HANDLER FOR THE UPLOAD TAB ----
def upload_handler(zip_path: str, progress=gr.Progress()):
    """
    Handles file and folder uploads with a progress bar.
    """
    progress(0, desc="🚀 Starting upload process...")

    if not zip_path:
        return "Error: No file uploaded"

    if not zip_path.endswith(".zip"):
        return "Error: Please upload a zip file"

    progress(0.05, desc="📦 Extracting ZIP file...")

    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(settings.RAW_DATA_DIR)
    except zipfile.BadZipFile:
        return "Invalid ZIP file."
    except Exception as e:
        return f"Error during extraction: {str(e)}"

    progress(0.15, desc="🔍 Scanning for files...")

    logger.info(f"Scanning extracted upload under {settings.RAW_DATA_DIR} for files/folders...")

    # --- Step 1: Collect every file path from the extracted input ---
    path = Path(settings.RAW_DATA_DIR)
    all_temp_file_paths = list(path.rglob("*"))
    all_temp_file_paths = [str(p) for p in all_temp_file_paths if os.path.isfile(p)]

    progress(0.25, desc="📊 Analyzing files...")

    if not all_temp_file_paths:
        return "No valid files found in the uploaded items."

    logger.info(f"Found a total of {len(all_temp_file_paths)} files to process.")
    progress(0.35, desc=f"✅ Found {len(all_temp_file_paths)} files to process...")

    files_to_ingest = all_temp_file_paths.copy()

    if not files_to_ingest:
        return "No valid files were moved for ingestion."

    # --- Step 3: Call the ingestion service to ingest all files in one pass ---
    try:
        progress(0.4, desc="🔄 Starting file ingestion...")
        # Run the ingestion with a progress callback
        ingestion_service.ingest_files_with_progress(files_to_ingest)

        success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
        logger.success(success_message)
        return success_message
    except Exception as e:
        error_message = f"An error occurred during the ingestion process: {e}"
        logger.error(error_message)
        return error_message

# ---- HANDLER FOR THE SEARCH TAB ----
def search_handler(text_query: str, image_query_path: str, audio_query_path: str, top_k: int):
    def create_empty_updates(max_results=10):
        updates = []
        for _ in range(max_results):
            # Exactly 5 components per result: group, markdown, textbox, image, audio
            updates.extend([
                gr.Group(visible=False),
                gr.Markdown(visible=False),
                gr.Textbox(visible=False),
                gr.Image(visible=False),
                gr.Audio(visible=False)
            ])
        return updates

    # Check the database before handling the query
    try:
        if retriever_instance.is_database_empty():
            empty_db_message = gr.Textbox(
                value="Database is empty. Please go to the 'Upload Data' tab to add files first.",
                visible=True
            )
            return [empty_db_message] + create_empty_updates()
    except Exception as e:
        error_message = gr.Textbox(
            value=f"Error checking database: {str(e)}",
            visible=True
        )
        return [error_message] + create_empty_updates()

    query_type, query_content = None, None
    if text_query and text_query.strip():
        query_type, query_content = "text", text_query
    elif image_query_path:
        query_type, query_content = "image", image_query_path
    elif audio_query_path:
        query_type, query_content = "audio", audio_query_path

    max_results = 10  # must match the number of result components created in the UI

    if not query_type:
        return [gr.Textbox(value="Error: Please provide a query.", visible=True)] + create_empty_updates()

    try:
        logger.info(f"Handling '{query_type}' query: {query_content}")
        results = retriever_instance.retrieve(query_content, query_type, int(top_k))

        if not results:
            return [gr.Textbox(value="No results found.", visible=True)] + create_empty_updates()

        output_updates = [gr.Textbox(value="", visible=False)]  # hide the info_box
        for i in range(max_results):
            if i < len(results):
                res = results[i]
                score, metadata, content = res['score'], res['metadata'], res.get('content')
                chunk_type, source_id = metadata.get('type', 'N/A'), metadata.get('source_id', 'N/A')
                info_text = f"### Result {i + 1} (Score: {score:.4f})\n**Type:** `{chunk_type}` | **Source:** `{source_id}`"

                text_val, text_visible = "", False
                img_val, img_visible = None, False
                audio_val, audio_visible = None, False

                if chunk_type == 'text':
                    text_val, text_visible = content, True
                elif chunk_type == "image":
                    if content and os.path.exists(content):
                        img_val, img_visible = content, True
                    else:
                        text_val, text_visible = "`Image content not found at path.`", True
                elif chunk_type == 'audio':
                    if content and os.path.exists(content):
                        audio_val, audio_visible = content, True
                    else:
                        text_val, text_visible = "`Audio content not found at path.`", True

                # Exactly 5 components per result
                output_updates.extend([
                    gr.Group(visible=True),
                    gr.Markdown(value=info_text, visible=True),
                    gr.Textbox(value=text_val, visible=text_visible),
                    gr.Image(value=img_val, visible=img_visible),
                    gr.Audio(value=audio_val, visible=audio_visible)
                ])
            else:
                # Exactly 5 components per result
                output_updates.extend([
                    gr.Group(visible=False),
                    gr.Markdown(visible=False),
                    gr.Textbox(visible=False),
                    gr.Image(visible=False),
                    gr.Audio(visible=False)
                ])

        return output_updates

    except Exception as e:
        error_message = f"Error during search: {str(e)}"
        logger.error(error_message)
        return [gr.Textbox(value=error_message, visible=True)] + create_empty_updates()

# --- 3. Build the interface with Gradio Blocks ---
def create_and_run_app():
    with gr.Blocks(theme=gr.themes.Soft(), title="Multimedia RAG Assistant") as demo:
        gr.Markdown("# Multimedia RAG Assistant")

        with gr.Tabs() as tabs:
            # --- TAB 1: SEARCH ---
            with gr.TabItem("Search Database", id=0):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input Query")
                        text_query_input = gr.Textbox(label="Text Query", placeholder="e.g., a dog playing in a park")
                        image_query_input = gr.Image(label="Image Query", type="filepath")
                        audio_query_input = gr.Audio(label="Audio Query", type="filepath")
                        top_k_slider = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Top K Results")
                        search_button = gr.Button("Search", variant="primary")

                    with gr.Column(scale=2):
                        gr.Markdown("### Retrieval Results")
                        info_box = gr.Textbox(label="Info", interactive=False, visible=False)
                        max_results = 10
                        result_components = []

                        for i in range(max_results):
                            with gr.Group(visible=False) as result_group:
                                result_info = gr.Markdown()
                                result_text = gr.Textbox(label="Text Content", interactive=False, visible=False)
                                result_image = gr.Image(label="Image Content", interactive=False, visible=False)
                                result_audio = gr.Audio(label="Audio Content", visible=False, type="filepath")
                            # Register exactly 5 components per result
                            result_components.extend([result_group, result_info, result_text, result_image, result_audio])

                all_outputs = [info_box] + result_components
                search_button.click(
                    fn=search_handler,
                    inputs=[text_query_input, image_query_input, audio_query_input, top_k_slider],
                    outputs=all_outputs
                )

            # --- TAB 2: UPLOAD ---
            with gr.TabItem("Upload Data", id=1):
                gr.Markdown("### Upload New Data to the Database")
                gr.Markdown("You can upload multiple files of different types at once (text, images, audio), or drop a folder.")
                with gr.Column():
                    upload_file_input = gr.File(
                        label="Upload ZIP file containing your data",
                        file_types=[".zip"],
                        file_count="single",
                        type="filepath"
                    )
                    upload_button = gr.Button("Upload and Ingest", variant="primary")
                    upload_status = gr.Textbox(label="Status", interactive=False, placeholder="Upload status will be shown here...")

                    upload_button.click(
                        fn=upload_handler,
                        inputs=[upload_file_input],
                        outputs=[upload_status],
                        show_progress="full"  # show the progress bar
                    )

        # Event handlers that clear the other inputs in the Search tab
        def clear_search_inputs(input_type):
            if input_type == 'text': return gr.Image(value=None), gr.Audio(value=None)
            elif input_type == 'image': return gr.Textbox(value=""), gr.Audio(value=None)
            elif input_type == 'audio': return gr.Textbox(value=""), gr.Image(value=None)

        text_query_input.change(lambda: clear_search_inputs('text'), outputs=[image_query_input, audio_query_input], queue=False)
        image_query_input.change(lambda: clear_search_inputs('image'), outputs=[text_query_input, audio_query_input], queue=False)
        audio_query_input.change(lambda: clear_search_inputs('audio'), outputs=[text_query_input, image_query_input], queue=False)

    return demo

# --- 4. Run the application ---
if __name__ == "__main__":
    logger.info("Launching Gradio interface...")
    demo = create_and_run_app()
    demo.launch()
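
Note on the output wiring above: the Search tab registers one info box plus five components (group, markdown, textbox, image, audio) for each of the ten result slots, so search_handler must always return exactly 1 + 10 * 5 = 51 updates in that fixed order. A minimal sketch of that contract, useful when changing max_results (illustrative only, not part of this commit):

# check_outputs.py — hypothetical helper, not shipped in this commit
MAX_RESULTS = 10                  # must stay in sync with max_results in app.py
COMPONENTS_PER_RESULT = 5         # gr.Group, gr.Markdown, gr.Textbox, gr.Image, gr.Audio
EXPECTED_OUTPUTS = 1 + MAX_RESULTS * COMPONENTS_PER_RESULT  # info_box + 10 result slots = 51

def check(updates):
    # search_handler's return value must line up with the all_outputs list in create_and_run_app()
    assert len(updates) == EXPECTED_OUTPUTS, f"expected {EXPECTED_OUTPUTS} updates, got {len(updates)}"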
config/database_configs.py
ADDED
File without changes
config/model_configs.py
ADDED
@@ -0,0 +1,17 @@
# config/model_configs.py

# Embedding Models
TEXT_EMBEDDING_MODEL: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
IMAGE_EMBEDDING_MODEL: str = "openai/clip-vit-base-patch32"  # or another CLIP checkpoint
AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"  # example CLAP model

# Generator Model (LLM/LMM)
GENERATOR_MODEL_NAME: str = "gpt-4o"  # or "google/gemma-2b", "meta-llama/Llama-2-7b-chat-hf", "llava-hf/llava-1.5-7b-hf"
GENERATOR_MODEL_MAX_TOKENS: int = 4096
GENERATOR_MODEL_TEMPERATURE: float = 0.7

# Reranker Model
RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Automatic Speech Recognition (ASR) Model (e.g., Hugging Face's Whisper)
ASR_MODEL: str = "openai/whisper-tiny"  # "base", "small", or "medium" are possible depending on GPU resources
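
Both the CLIP and CLAP checkpoints configured here project into 512-dimensional embedding spaces, which is why core/retrieval/retriever.py below hard-codes image_dim = 512 and audio_dim = 512; swapping either model ID means updating those dimensions as well. A minimal sketch (not part of this commit) of reading the dimensions from the loaded models instead of hard-coding them:

# dims_check.py — hypothetical helper, not shipped in this commit
from transformers import CLIPModel, ClapModel
from config.model_configs import IMAGE_EMBEDDING_MODEL, AUDIO_EMBEDDING_MODEL

clip = CLIPModel.from_pretrained(IMAGE_EMBEDDING_MODEL)
print("image dim:", clip.config.projection_dim)   # 512 for openai/clip-vit-base-patch32
clap = ClapModel.from_pretrained(AUDIO_EMBEDDING_MODEL)
print("audio dim:", clap.config.projection_dim)   # 512 for laion/clap-htsat-unfused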
config/settings.py
ADDED
@@ -0,0 +1,36 @@
from pydantic_settings import BaseSettings, SettingsConfigDict
import os
from typing import Optional
from dotenv import load_dotenv

load_dotenv()

class Settings(BaseSettings):
    APP_NAME: str = "Multimedia RAG Assistant"
    APP_VERSION: str = "0.1.0"
    ENVIRONMENT: Optional[str] = os.getenv('ENVIRONMENT')  # optional so startup does not fail when the variable is unset

    DATA_DIR: str = "data"
    RAW_DATA_DIR: str = os.path.join("data", "raw")
    PROCESSED_DATA_DIR: str = os.path.join("data", "processed")
    CHUNKS_DIR: str = os.path.join("data", "processed", "chunks")
    METADATA_DIR: str = os.path.join("data", "processed", "metadata")
    EMBEDDINGS_DIR: str = os.path.join("data", "processed", "embeddings")

    API_HOST: str = "0.0.0.0"
    API_PORT: int = 8000

    # Model configuration
    # This is where API keys or model IDs are added later
    HUGGINGFACE_API_KEY: Optional[str] = os.getenv('HUGGINGFACE_API_KEY')  # e.g., when using Hugging Face models

    # Logger configuration
    LOG_LEVEL: str = "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
        case_sensitive=True
    )

settings = Settings()
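
These settings are resolved from the process environment and from a local .env file (picked up both by load_dotenv() and by SettingsConfigDict(env_file=".env")). A minimal sketch of such a file with placeholder values; nothing like it ships with this commit, and the .gitignore added above already keeps .env out of version control:

# .env (placeholder values only)
ENVIRONMENT=development
HUGGINGFACE_API_KEY=hf_xxx_replace_me
LOG_LEVEL=DEBUG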
core/data_processing/audio_processor.py
ADDED
@@ -0,0 +1,80 @@
# core/data_processing/audio_processor.py
import os

from typing import List, Dict, Any
from utils.logger import logger
from pydub import AudioSegment
from pydub.silence import split_on_silence
from config.settings import settings

class AudioProcessor:
    def __init__(self, min_silence_len: int = 1000, silence_thresh_db: int = -40, target_sr: int = 16000):
        self.min_silence_len = min_silence_len
        self.silence_thresh_db = silence_thresh_db
        self.target_sr = target_sr
        logger.info(f"AudioProcessor initialized (min_silence_len={min_silence_len}ms, silence_thresh_db={silence_thresh_db}dB).")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        try:
            logger.info(f"Processing audio file: {file_path}")
            audio = AudioSegment.from_file(file_path)

            if audio.frame_rate != self.target_sr:
                audio = audio.set_frame_rate(self.target_sr)

            audio_segments = split_on_silence(
                audio,
                min_silence_len=self.min_silence_len,
                silence_thresh=self.silence_thresh_db,
                keep_silence=500
            )

            chunks = []
            audio_chunks_dir = os.path.join(settings.CHUNKS_DIR, "audio")
            os.makedirs(audio_chunks_dir, exist_ok=True)

            for i, segment in enumerate(audio_segments):
                segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
                chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")

                # Save the segment as a temporary WAV file
                segment.export(chunk_file_path, format="wav")

                metadata = {
                    "source_id": os.path.basename(file_path),
                    "type": "audio",
                    "chunk_id": segment_id,
                    "chunk_data_path": chunk_file_path,
                    # "start_time_ms": int(segment.start_time),
                    # "end_time_ms": int(segment.end_time),
                    "duration_ms": len(segment)
                }
                chunks.append({
                    "content": chunk_file_path,
                    "metadata": metadata
                })
            logger.info(f"Generated {len(chunks)} audio segments from {file_path}")
            return chunks
        except FileNotFoundError:
            logger.error(f"Audio file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
            return []
        except Exception as e:
            logger.error(f"Error processing audio file {file_path}: {e}")
            return []

# Usage example (kept for testing)
if __name__ == "__main__":
    sample_audio_path = os.path.join(settings.RAW_DATA_DIR, "audios", "sample_audio.wav")
    if not os.path.exists(sample_audio_path):
        print(f"ERROR: Sample audio not found at {sample_audio_path}. Please create it first.")
        print("Make sure you have ffmpeg installed and available in your PATH for pydub to work.")
    else:
        processor = AudioProcessor()
        audio_chunks = processor.process(sample_audio_path)

        for i, chunk in enumerate(audio_chunks):
            print(f"\n--- Audio Chunk {i+1} ---")
            print(f"Type: {chunk['metadata']['type']}")
            print(f"Content (path): {chunk['content']}")
            print(f"Metadata: {chunk['metadata']}")
            # You can open the file at chunk['content'] to listen to the segment
core/data_processing/image_processor.py
ADDED
@@ -0,0 +1,89 @@
# core/data_processing/image_processor.py
from typing import List, Dict, Any
import os
from PIL import Image
from utils.logger import logger

class ImageProcessor:
    def __init__(self):
        logger.info("ImageProcessor initialized.")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        try:
            logger.debug(f"Processing image file: {file_path}")

            if not os.path.exists(file_path):
                logger.error(f"Image file not found: {file_path}")
                return []

            with Image.open(file_path) as img:
                width, height = img.size
                img_format = img.format

            file_size = os.path.getsize(file_path)

            # Create a unique ID for this chunk
            # Use the file name without its extension
            base_name = os.path.basename(file_path)
            chunk_id = f"{os.path.splitext(base_name)[0]}_image_chunk"

            metadata = {
                "source_id": base_name,
                "type": "image",
                "chunk_id": chunk_id,
                "chunk_data_path": file_path,
                "image_width": width,
                "image_height": height,
                "image_format": img_format,
                "file_size_bytes": file_size
            }

            # Build the chunk
            # The content is the file path, just like for audio/video segments
            chunk = {
                "content": file_path,
                "metadata": metadata
            }

            # Return a list containing a single chunk
            return [chunk]

        except Exception as e:
            logger.error(f"Error processing image file {file_path}: {e}")
            return []

# Usage example (internal module check only)
if __name__ == "__main__":
    from config.settings import settings
    import os

    # Create a dummy image for testing
    dummy_image_dir = os.path.join(settings.RAW_DATA_DIR, "images")
    os.makedirs(dummy_image_dir, exist_ok=True)
    dummy_image_path = os.path.join(dummy_image_dir, "test_image.jpg")

    try:
        # Create a blue sample image
        dummy_img = Image.new('RGB', (100, 150), color='blue')
        dummy_img.save(dummy_image_path)
        print(f"Created a dummy image for testing at: {dummy_image_path}")

        # Initialize the processor and process the image
        processor = ImageProcessor()
        image_chunks = processor.process(dummy_image_path)

        if image_chunks:
            print("\n--- Image Chunk Processed ---")
            chunk = image_chunks[0]
            print(f"Content (path): {chunk['content']}")
            print("Metadata:")
            for key, value in chunk['metadata'].items():
                print(f"  - {key}: {value}")
        else:
            print("Failed to process the dummy image.")

    finally:
        # Clean up the dummy image
        if os.path.exists(dummy_image_path):
            os.remove(dummy_image_path)
            print(f"Cleaned up dummy image: {dummy_image_path}")
core/data_processing/text_processor.py
ADDED
@@ -0,0 +1,64 @@
import os

from typing import List, Dict, Any
from utils.logger import logger
from langchain_text_splitters import RecursiveCharacterTextSplitter

class TextProcessor:
    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,  # counts characters; can be replaced with a token counter
            add_start_index=True
        )

        logger.info(f"TextProcessor initialized with LangChain's RecursiveCharacterTextSplitter (chunk_size={chunk_size}, chunk_overlap={chunk_overlap})")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
            logger.info(f"Processing text document: {file_path}")
            split_texts = self.text_splitter.split_text(text)

            chunks = []
            for i, chunk_content in enumerate(split_texts):
                start_char_idx = text.find(chunk_content)  # find the start index of each chunk_content
                chunk_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_text_{i}"
                metadata = {
                    "source_id": os.path.basename(file_path),
                    "type": "text",
                    "chunk_id": chunk_id,
                    "start_char_index": start_char_idx,  # starting character position
                    "end_char_index": start_char_idx + len(chunk_content),  # ending character position
                    "content_length": len(chunk_content)
                }
                chunks.append({
                    "content": chunk_content,
                    "metadata": metadata
                })
            logger.info(f"Generated {len(chunks)} text chunks from {file_path}")
            return chunks
        except Exception as e:
            logger.error(f"Error processing text document {file_path}: {e}")
            return []

# Usage example (kept for testing)
if __name__ == "__main__":
    from config.settings import settings
    import os

    sample_doc_path = os.path.join(settings.RAW_DATA_DIR, "documents", "sample_document.txt")
    if not os.path.exists(sample_doc_path):
        print(f"ERROR: Sample document not found at {sample_doc_path}. Please create it first.")
    else:
        processor = TextProcessor(chunk_size=100, chunk_overlap=20)  # use a smaller size to see the chunks clearly
        text_chunks = processor.process(sample_doc_path)

        for i, chunk in enumerate(text_chunks):  # print all chunks for inspection
            print(f"\n--- Chunk {i+1} ---")
            print(f"Content: {chunk['content']}")  # print the whole chunk content
            print(f"Metadata: {chunk['metadata']}")
core/data_processing/video_processor.py
ADDED
@@ -0,0 +1,118 @@
# core/data_processing/video_processor.py
import os
import torch
import cv2
import numpy as np

from typing import List, Dict, Any
from utils.logger import logger
from moviepy.editor import VideoFileClip
from config.settings import settings

class VideoProcessor:
    def __init__(self, chunk_duration_sec: int = 10, frames_per_segment: int = 5):
        self.chunk_duration_sec = chunk_duration_sec
        self.frames_per_segment = frames_per_segment
        logger.info(f"VideoProcessor initialized (chunk_duration={chunk_duration_sec}s, frames_per_segment={frames_per_segment}).")

    def process_video(self, file_path: str) -> List[Dict[str, Any]]:
        try:
            logger.info(f"Processing video file: {file_path}")
            video_clip = VideoFileClip(file_path)
            total_duration = video_clip.duration  # total video duration in seconds

            all_chunks = []

            # Subdirectory for the temporarily extracted frames
            image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks", os.path.basename(file_path).split('.')[0])
            os.makedirs(image_chunks_dir, exist_ok=True)

            # Subdirectory for the temporary video segments
            video_segments_dir = os.path.join(settings.CHUNKS_DIR, "video/video_segments", os.path.basename(file_path).split('.')[0])
            os.makedirs(video_segments_dir, exist_ok=True)

            current_time = 0.0
            chunk_idx = 0

            while current_time < total_duration:
                end_time = min(current_time + self.chunk_duration_sec, total_duration)  # end time of this segment
                segment_clip = video_clip.subclip(current_time, end_time)

                segment_base_name = f"{os.path.basename(file_path).split('.')[0]}_segment_{chunk_idx}"

                frames_paths = []

                frame_timestamps = np.linspace(0, segment_clip.duration, self.frames_per_segment + 2)[1:-1]

                for ts in frame_timestamps:
                    frame = segment_clip.get_frame(ts)
                    frame_filename = f"{segment_base_name}_frame_{int(ts*1000)}.jpg"
                    frame_path = os.path.join(image_chunks_dir, frame_filename)
                    cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                    frames_paths.append(frame_path)

                # Build the chunk for the extracted frames
                image_chunk_id = f"{segment_base_name}_image"
                all_chunks.append({
                    "content": frames_paths,  # list of paths to the frame images
                    "metadata": {
                        "source_id": os.path.basename(file_path),
                        "type": "video_frame",  # chunk type
                        "chunk_id": image_chunk_id,
                        "start_time_sec": current_time,
                        "end_time_sec": end_time,
                        "frame_paths": frames_paths  # keep the paths in the metadata as well
                    }
                })

                # 2. Save the video segment itself (optional, but useful for video retrieval)
                video_segment_path = os.path.join(video_segments_dir, f"{segment_base_name}.mp4")
                segment_clip.write_videofile(video_segment_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)

                # Build the chunk for the video segment
                video_chunk_id = f"{segment_base_name}_video_clip"
                all_chunks.append({
                    "content": video_segment_path,  # path to the video clip file
                    "metadata": {
                        "source_id": os.path.basename(file_path),
                        "type": "video_segment_clip",  # new chunk type: video clip
                        "chunk_id": video_chunk_id,
                        "start_time_sec": current_time,
                        "end_time_sec": end_time,
                        "chunk_data_path": video_segment_path  # keep the path in the metadata as well
                    }
                })

                current_time = end_time
                chunk_idx += 1

            video_clip.close()  # make sure resources are released
            logger.info(f"Generated {len(all_chunks)} chunks (frames & video segments) from video {file_path}")
            return all_chunks
        except FileNotFoundError:
            logger.error(f"Video file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
            return []
        except Exception as e:
            logger.error(f"Error processing video file {file_path}: {e}")
            return []

# Usage example (kept for testing)
if __name__ == "__main__":
    sample_video_path = os.path.join(settings.RAW_DATA_DIR, "videos", "sample_video.mp4")
    if not os.path.exists(sample_video_path):
        print(f"ERROR: Sample video not found at {sample_video_path}. Please create it first.")
        print("Make sure you have ffmpeg installed and available in your PATH for moviepy to work.")
    else:
        processor = VideoProcessor(chunk_duration_sec=5, frames_per_segment=3)
        video_chunks = processor.process_video(sample_video_path)

        for i, chunk in enumerate(video_chunks):
            print(f"\n--- Video Chunk {i+1} ---")
            print(f"Type: {chunk['metadata']['type']}")
            if chunk['metadata']['type'] == 'video_frame':  # matches the type written above
                print(f"Content (paths): {chunk['content']}")
                if chunk['content']:
                    print(f"Sample frame: {chunk['content'][0]}")
            elif chunk['metadata']['type'] == 'video_segment_clip':
                print(f"Content (path): {chunk['content']}")
            print(f"Metadata: {chunk['metadata']}")
core/embeddings/audio_embedding_model.py
ADDED
@@ -0,0 +1,86 @@
# core/embeddings/audio_embedding_model.py
import torch
import librosa
import numpy as np

from typing import List
from transformers import AutoProcessor, AutoModel
from utils.logger import logger
from config.model_configs import AUDIO_EMBEDDING_MODEL

class AudioEmbeddingModel:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Audio Embedding Model '{AUDIO_EMBEDDING_MODEL}' to device: {self.device}")

        self.processor = AutoProcessor.from_pretrained(AUDIO_EMBEDDING_MODEL)
        self.model = AutoModel.from_pretrained(AUDIO_EMBEDDING_MODEL).to(self.device)
        logger.info("Audio Embedding Model loaded successfully.")

    def get_embeddings(self, audio_paths: List[str]) -> List[List[float]]:
        if not audio_paths:
            return []

        audio_inputs = []
        sample_rate = self.processor.feature_extractor.sampling_rate

        for audio_path in audio_paths:
            try:
                audio_data, sr = librosa.load(audio_path, sr=sample_rate)
                audio_inputs.append(audio_data)
            except Exception as e:
                logger.warning(f"Could not load audio {audio_path}: {e}. Skipping.")
                continue

        if not audio_inputs:
            return []

        inputs = self.processor(audios=audio_inputs, sampling_rate=sample_rate, return_tensors="pt", padding=True).to(self.device)

        with torch.no_grad():
            audio_features = self.model.get_audio_features(**inputs)

        embeddings = audio_features / audio_features.norm(p=2, dim=-1, keepdim=True)

        embeddings_list = embeddings.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
        return embeddings_list

# Usage example (internal module check only)
if __name__ == "__main__":
    from config.settings import settings
    import os

    model = AudioEmbeddingModel()
    sample_audio_dir = os.path.join(settings.PROCESSED_DATA_DIR, "audio_segments", "sample_audio")  # assumes an audio folder from the sample file exists

    # Create a dummy audio clip if no sample audio exists
    if not os.path.exists(sample_audio_dir) or not os.listdir(sample_audio_dir):
        print(f"Creating a dummy audio for testing at {sample_audio_dir}...")
        os.makedirs(sample_audio_dir, exist_ok=True)
        from pydub import AudioSegment
        dummy_audio = AudioSegment.silent(duration=1000)  # 1 second of silence
        dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
        dummy_audio.export(dummy_audio_path, format="wav")
        sample_audio_paths = [dummy_audio_path]
    else:
        sample_audio_paths = [os.path.join(sample_audio_dir, f) for f in os.listdir(sample_audio_dir) if f.endswith(('.wav', '.mp3'))]
        if not sample_audio_paths:
            print(f"No audio files found in {sample_audio_dir}. Please ensure sample audio was processed.")
            from pydub import AudioSegment
            dummy_audio = AudioSegment.silent(duration=1000)
            dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
            dummy_audio.export(dummy_audio_path, format="wav")
            sample_audio_paths = [dummy_audio_path]

    print(f"Using {len(sample_audio_paths)} sample audio clips: {sample_audio_paths[:2]}...")
    embeddings = model.get_embeddings(sample_audio_paths)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        if len(embeddings) > 1:
            from sklearn.metrics.pairwise import cosine_similarity
            sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
            print(f"Similarity between audio 1 and 2: {sim:.4f}")
core/embeddings/image_embedding_model.py
ADDED
@@ -0,0 +1,80 @@
import torch

from typing import List
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from utils.logger import logger
from config.model_configs import IMAGE_EMBEDDING_MODEL

class ImageEmbeddingModel:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")

        self.model = CLIPModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
        logger.info("Image Embedding Model loaded successfully.")

    def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
        if not image_paths:
            return []

        images = []
        for img_path in image_paths:
            try:
                images.append(Image.open(img_path).convert("RGB"))
            except Exception as e:
                logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
                continue

        if not images:
            return []

        inputs = self.processor(images=images, return_tensors="pt").to(self.device)

        with torch.no_grad():
            image_features = self.model.get_image_features(pixel_values=inputs.pixel_values)

        embeddings = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

        embeddings_list = embeddings.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(images)} images.")
        return embeddings_list

# Usage example (internal module check only)
if __name__ == "__main__":
    from config.settings import settings
    import os

    model = ImageEmbeddingModel()
    sample_image_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks/sample_video")  # assumes a frame folder from the sample video exists

    # Create a dummy image if no sample images exist
    if not os.path.exists(sample_image_dir) or not os.listdir(sample_image_dir):
        print(f"Creating a dummy image for testing at {sample_image_dir}...")
        os.makedirs(sample_image_dir, exist_ok=True)
        dummy_image_path = os.path.join(sample_image_dir, "dummy_image.jpg")
        dummy_img = Image.new('RGB', (60, 30), color='red')
        dummy_img.save(dummy_image_path)
        sample_image_paths = [dummy_image_path]
    else:
        sample_image_paths = [os.path.join(sample_image_dir, f) for f in os.listdir(sample_image_dir) if f.endswith(('.jpg', '.png'))]
        if not sample_image_paths:  # the folder exists but contains no images
            print(f"No images found in {sample_image_dir}. Please ensure sample video was processed.")
            dummy_image_path = os.path.join(sample_image_dir, "dummy_image.jpg")
            dummy_img = Image.new('RGB', (60, 30), color='red')
            dummy_img.save(dummy_image_path)
            sample_image_paths = [dummy_image_path]

    print(f"Using {len(sample_image_paths)} sample images: {sample_image_paths[:2]}...")
    embeddings = model.get_embeddings(sample_image_paths)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        # If there are enough images, compare the first two
        if len(embeddings) > 1:
            from sklearn.metrics.pairwise import cosine_similarity
            sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
            print(f"Similarity between image 1 and 2: {sim:.4f}")
core/embeddings/text_embedding_model.py
ADDED
@@ -0,0 +1,42 @@
import torch

from typing import List
from sentence_transformers import SentenceTransformer
from utils.logger import logger
from config.model_configs import TEXT_EMBEDDING_MODEL

class TextEmbeddingModel:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Text Embedding Model '{TEXT_EMBEDDING_MODEL}' to device: {self.device}")

        self.model = SentenceTransformer(TEXT_EMBEDDING_MODEL, device=self.device)
        logger.info("Text Embedding Model loaded successfully.")

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        if not texts:
            return []

        embeddings = self.model.encode(texts, convert_to_numpy=True).tolist()
        logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
        return embeddings

# Usage example (internal module check only)
if __name__ == "__main__":
    model = TextEmbeddingModel()
    sample_texts = [
        "This is a test sentence.",
        "Another sentence for embedding.",
        "How about some natural language processing?",
        "Xe hơi màu đỏ đang chạy trên đường phố."  # try Vietnamese as well
    ]
    embeddings = model.get_embeddings(sample_texts)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        # You can compute cosine similarity between the embeddings here to see how similar they are
        from sklearn.metrics.pairwise import cosine_similarity
        sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
        print(f"Similarity between text 1 and 2: {sim:.4f}")
core/retrieval/retriever.py
ADDED
|
@@ -0,0 +1,156 @@
# core/retrieval/retriever.py
import os

from typing import List, Tuple, Dict, Any, Union
from utils.logger import logger
from config.settings import settings
from qdrant_client import QdrantClient

from core.embeddings.text_embedding_model import TextEmbeddingModel
from core.embeddings.image_embedding_model import ImageEmbeddingModel
from core.embeddings.audio_embedding_model import AudioEmbeddingModel

from core.retrieval.vector_db_manager import VectorDBManager

class Retriever:
    def __init__(self, client: QdrantClient):
        logger.info("Initializing the Retriever...")

        # Initialize embedding models
        self.text_embedder = TextEmbeddingModel()
        self.image_embedder = ImageEmbeddingModel()
        self.audio_embedder = AudioEmbeddingModel()
        logger.info("Embedding models initialized.")

        qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
        self.client = client
        logger.info(f"Single Qdrant client initialized, connected to: {qdrant_db_path}")

        # Initialize vector database managers
        text_dim = self.text_embedder.model.get_sentence_embedding_dimension()
        self.text_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_dim, client=self.client)

        image_dim = 512
        self.image_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_dim, client=self.client)

        audio_dim = 512
        self.audio_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=audio_dim, client=self.client)

        logger.info("VectorDB Managers connected to Qdrant collections.")
        logger.info(f"Text collection ('{self.text_db_manager.collection_name}') contains {self.text_db_manager.get_total_vectors()} vectors.")
        logger.info(f"Image collection ('{self.image_db_manager.collection_name}') contains {self.image_db_manager.get_total_vectors()} vectors.")
        logger.info(f"Audio collection ('{self.audio_db_manager.collection_name}') contains {self.audio_db_manager.get_total_vectors()} vectors.")

    def retrieve(self, query: Union[str, bytes], query_type: str, top_k: int = 5) -> List[Dict[str, Any]]:
        logger.info(f"Received retrieval request. Query type: '{query_type}', Top K: {top_k}")

        embedding = None
        db_manager_to_use = None

        # Create the query embedding
        try:
            if query_type == "text":
                if not isinstance(query, str):
                    raise TypeError("Text query must be a string.")
                embedding = self.text_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.text_db_manager
            elif query_type == "image":
                if not isinstance(query, str) or not os.path.exists(query):
                    raise TypeError("Image query must be a valid file path.")
                embedding = self.image_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.image_db_manager
            elif query_type == "audio":
                if not isinstance(query, str) or not os.path.exists(query):
                    raise TypeError("Audio query must be a valid file path.")
                embedding = self.audio_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.audio_db_manager
            else:
                logger.error(f"Unsupported query type: {query_type}")
                return []
        except Exception as e:
            logger.error(f"Error generating embedding for query: {e}")
            return []

        if embedding is None:
            logger.warning("Could not generate embedding for the query.")
            return []

        # Search the vector database
        try:
            search_results = db_manager_to_use.search_vectors(embedding, k=top_k)
        except Exception as e:
            logger.error(f"Error searching in vector database: {e}")
            return []

        formatted_results = []
        for score, payload in search_results:
            formatted_results.append({
                "score": score,
                "metadata": payload['metadata'],
                "content": payload['content']
            })

        logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
        return formatted_results

    # def _get_content_from_payload(self, payload: Dict):
    #     chunk_type = payload.get("type")
    #     if chunk_type == "text":
    #         return None  # To be improved later
    #     elif chunk_type == 'image' or chunk_type == "audio":
    #         return payload.get('chunk_data_path')  # Return the file path
    #     return None

    def is_database_empty(self) -> bool:
        total_vectors = self.text_db_manager.get_total_vectors() \
            + self.image_db_manager.get_total_vectors() \
            + self.audio_db_manager.get_total_vectors()

        return total_vectors == 0

if __name__ == "__main__":
    from config.settings import settings

    logger.info("--- Running Retriever Standalone Test (Qdrant version) ---")

    # Check whether Qdrant already has data
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    if not os.path.exists(qdrant_db_path):
        print("\n\nERROR: Qdrant database not found. Please run 'python scripts/ingest_data.py' first to create the database.\n\n")
    else:
        client = QdrantClient(path=qdrant_db_path)
        retriever = Retriever(client)

        # --- 1. Try a text query ---
        print("\n--- Testing Text Retrieval ---")
        text_query = "What is artificial intelligence?"
        text_results = retriever.retrieve(text_query, query_type="text", top_k=3)
        print(f"Query: '{text_query}'")
        for i, result in enumerate(text_results):
            print(f"  Result {i+1}:")
            print(f"    Score: {result['score']:.4f}")
            print(f"    Type: {result['metadata']['type']}")
            print(f"    Content Preview: {str(result.get('content'))[:200] if result.get('content') else 'N/A'}...")
            print(f"    Source: {result['metadata']['source_id']}")

        # --- 2. Try an image query ---
        print("\n--- Testing Image Retrieval ---")
        # Take one image from the processed chunks to use as the query
        image_to_query = None
        image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks")
        if os.path.exists(image_chunks_dir):
            for root, _, files in os.walk(image_chunks_dir):
                if files:
                    image_to_query = os.path.join(root, files[0])
                    break

        if image_to_query and os.path.exists(image_to_query):
            print(f"Using image as query: {image_to_query}")
            image_results = retriever.retrieve(image_to_query, query_type="image", top_k=3)
            for i, result in enumerate(image_results):
                print(f"  Result {i+1}:")
                print(f"    Score: {result['score']:.4f}")
                print(f"    Type: {result['metadata']['type']}")
                print(f"    Content (Paths): {result['content']}")
                print(f"    Source: {result['metadata']['source_id']}")
        else:
            print("Could not find a sample image to test image retrieval.")
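
A point worth keeping in mind when consuming retrieve() results (presumably in app.py): for text hits the 'content' field holds the chunk text, while for image and audio hits it holds file paths, as the standalone test above prints ("Content (Paths)"). A minimal display helper built on that assumption could look like the sketch below; render_hit is a hypothetical name, not part of this commit.

def render_hit(hit: dict) -> str:
    # hit is one element returned by Retriever.retrieve(); keys follow the code above
    kind = hit["metadata"]["type"]
    if kind == "text":
        # text chunks carry the raw chunk text in 'content'
        return f"[{hit['score']:.3f}] {str(hit['content'])[:120]}"
    # image/audio chunks carry file paths in 'content'
    return f"[{hit['score']:.3f}] {kind} -> {hit['content']}"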
core/retrieval/vector_db_manager.py
ADDED
|
@@ -0,0 +1,169 @@
import os

from typing import List, Tuple, Dict, Any
from uuid import uuid4

from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus

from utils.logger import logger
from config.settings import settings

class VectorDBManager:
    def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
        logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")

        if client:
            self.client = client
            logger.info("Using shared Qdrant client instance.")
        else:
            logger.warning("No shared Qdrant client provided. Creating a new local instance.")
            qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
            self.client = QdrantClient(path=qdrant_db_path)

        self.collection_name = collection_name
        self.embedding_dim = embedding_dim

        self.create_collection_if_not_exists()

    def create_collection_if_not_exists(self):
        try:
            collections = self.client.get_collections().collections
            collection_names = [collection.name for collection in collections]

            if self.collection_name not in collection_names:
                logger.info(f"Collection '{self.collection_name}' not found. Creating a new one...")

                self.client.recreate_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(
                        size=self.embedding_dim,
                        distance=Distance.COSINE
                    )
                )
                logger.success(f"Collection '{self.collection_name}' created successfully.")
            else:
                logger.info(f"Collection '{self.collection_name}' already exists.")

        except Exception as e:
            logger.error(f"Error checking or creating collection '{self.collection_name}': {e}")
            raise

    def add_vectors(self, embeddings: List[List[float]], metadatas: List[Dict[str, Any]]):
        if not embeddings:
            logger.warning("No embeddings to add. Skipping.")
            return

        if len(embeddings) != len(metadatas):
            logger.error("Number of embeddings and metadatas must match.")
            raise ValueError("Embeddings and metadatas count mismatch.")

        points_to_add = []
        for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas)):
            point_id = str(uuid4())

            points_to_add.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload=metadata
                )
            )

        try:
            operation_info = self.client.upsert(
                collection_name=self.collection_name,
                wait=True,
                points=points_to_add
            )
            if operation_info.status == UpdateStatus.COMPLETED:
                logger.debug(f"Successfully upserted {len(points_to_add)} points to collection '{self.collection_name}'.")
            else:
                logger.warning(f"Upsert operation finished with status: {operation_info.status}")
        except Exception as e:
            logger.error(f"Error upserting points to collection '{self.collection_name}': {e}")

    def search_vectors(self, query_embedding: List[float], k: int = 5, filter_payload: Dict = None) -> List[Tuple[float, Dict[str, Any]]]:
        try:
            search_results = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding,
                query_filter=filter_payload,
                limit=k,
                with_payload=True,   # include payload in the results
                with_vectors=False   # exclude vectors from the results
            )

            formatted_results = []
            for scored_point in search_results:
                score = scored_point.score
                payload = scored_point.payload
                formatted_results.append((score, payload))

            logger.debug(f"Searched for top {k} neighbors. Found {len(formatted_results)} results.")
            return formatted_results
        except Exception as e:
            logger.error(f"Error searching in collection '{self.collection_name}': {e}")
            return []

    def get_total_vectors(self) -> int:
        try:
            count_result = self.client.count(
                collection_name=self.collection_name,
                exact=True  # exact count
            )
            return count_result.count
        except Exception as e:
            logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
            return 0

# Example usage (module-level smoke test only)
if __name__ == "__main__":
    import numpy as np

    # Parameters for the test collection
    TEST_COLLECTION_NAME = "my_test_collection"
    DUMMY_DIM = 128

    # --- Test collection creation ---
    print("\n--- Testing Collection Creation ---")
    db_manager = VectorDBManager(collection_name=TEST_COLLECTION_NAME, embedding_dim=DUMMY_DIM)
    print(f"Total vectors initially: {db_manager.get_total_vectors()}")

    # --- Test adding vectors and payloads ---
    print("\n--- Testing Add Vectors ---")
    dummy_embeddings = np.random.rand(10, DUMMY_DIM).tolist()
    dummy_metadatas = [
        {"chunk_id": f"dummy_chunk_{i}", "type": "text" if i < 5 else "image", "source_file": "test.txt"}
        for i in range(10)
    ]
    db_manager.add_vectors(dummy_embeddings, dummy_metadatas)
    print(f"Total vectors after adding: {db_manager.get_total_vectors()}")

    # --- Test search ---
    print("\n--- Testing Search ---")
    dummy_query = np.random.rand(DUMMY_DIM).tolist()
    results = db_manager.search_vectors(dummy_query, k=3)
    print("Top 3 results (no filter):")
    for score, payload in results:
        print(f"  Score: {score:.4f}, Payload: {payload}")

    # --- Test search WITH a payload filter (pre-filtering) ---
    print("\n--- Testing Search with Filter ---")
    filter_condition = models.Filter(
        must=[
            models.FieldCondition(
                key="type",  # filter on the 'type' field of the payload
                match=models.MatchValue(value="image"),  # value must be 'image'
            )
        ]
    )
    filtered_results = db_manager.search_vectors(dummy_query, k=3, filter_payload=filter_condition)
    print("Top 3 results (filtered for type='image'):")
    for score, payload in filtered_results:
        print(f"  Score: {score:.4f}, Payload: {payload}")

    # --- Clean up the test collection ---
    print("\n--- Cleaning up test collection ---")
    db_manager.client.delete_collection(collection_name=TEST_COLLECTION_NAME)
    print(f"Collection '{TEST_COLLECTION_NAME}' deleted.")
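
Because QdrantClient in local "path=..." mode takes an exclusive lock on its storage directory (portalocker is among the pinned dependencies), only one client instance can open the qdrant_data folder at a time; that is presumably why every manager accepts an already-constructed client instead of opening its own. A minimal sketch of sharing one client across collections follows; the local path and the 384-dimensional text model are assumptions, not values taken from this commit.

from qdrant_client import QdrantClient
from core.retrieval.vector_db_manager import VectorDBManager

client = QdrantClient(path="data/qdrant_data")  # assumed local path
text_db = VectorDBManager(collection_name="text_collection", embedding_dim=384, client=client)    # 384 is an assumed dimension
image_db = VectorDBManager(collection_name="image_collection", embedding_dim=512, client=client)  # both managers reuse the same client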
requirements.txt
ADDED
|
@@ -0,0 +1,125 @@
aiofiles==23.2.1
altair==5.5.0
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
audioread==3.0.1
certifi==2025.7.14
cffi==1.17.1
charset-normalizer==3.4.2
click==8.2.1
colorama==0.4.6
contourpy==1.3.3
cycler==0.12.1
decorator==4.4.2
faiss-cpu==1.11.0.post1
fastapi==0.116.1
ffmpy==0.6.1
filelock==3.13.1
fonttools==4.59.0
fsspec==2024.6.1
gradio==5.6.0
gradio_client==1.4.3
grpcio==1.74.0
grpcio-tools==1.74.0
h11==0.16.0
h2==4.2.0
hpack==4.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.34.3
hyperframe==6.1.0
idna==3.10
imageio==2.37.0
imageio-ffmpeg==0.6.0
importlib_resources==6.5.2
inquirerpy==0.3.4
Jinja2==3.1.4
joblib==1.5.1
jsonpatch==1.33
jsonpointer==3.0.0
jsonschema==4.25.0
jsonschema-specifications==2025.4.1
kiwisolver==1.4.8
langchain-core==0.2.43
langchain-text-splitters==0.2.0
langsmith==0.1.147
lazy_loader==0.4
librosa==0.10.2
llvmlite==0.44.0
loguru==0.7.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.10.5
mdurl==0.1.2
moviepy==1.0.3
mpmath==1.3.0
msgpack==1.1.1
narwhals==2.0.1
networkx==3.3
numba==0.61.2
numpy==1.26.4
opencv-python==4.12.0.88
orjson==3.11.1
packaging==24.2
pandas==2.3.1
pfzy==0.3.4
pillow==10.4.0
platformdirs==4.3.8
pooch==1.8.2
portalocker==2.10.1
proglog==0.1.12
prompt_toolkit==3.0.51
protobuf==6.31.1
pycparser==2.22
pydantic==2.10.6
pydantic-settings==2.3.4
pydantic_core==2.27.2
pydub==0.25.1
Pygments==2.19.2
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-dotenv==1.1.1
python-multipart==0.0.12
pytz==2025.2
pywin32==311
PyYAML==6.0.2
qdrant-client==1.9.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.4
requests-toolbelt==1.0.0
rich==14.1.0
rpds-py==0.26.0
ruff==0.12.7
safehttpx==0.1.6
safetensors==0.5.3
scikit-learn==1.7.1
scipy==1.16.1
semantic-version==2.10.0
sentence-transformers==5.0.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soundfile==0.13.1
soxr==0.5.0.post1
starlette==0.47.2
sympy==1.13.1
tenacity==8.5.0
threadpoolctl==3.6.0
tokenizers==0.19.1
tomlkit==0.12.0
torch==2.6.0+cu124
torchaudio==2.6.0+cu124
torchvision==0.21.0+cu124
tqdm==4.67.1
transformers==4.41.2
typer==0.16.0
typing-inspection==0.4.1
typing_extensions==4.12.2
tzdata==2025.2
urllib3==2.5.0
uvicorn==0.35.0
wcwidth==0.2.13
websockets==11.0.3
win32_setctime==1.2.0
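
Note on installing these pins: the torch, torchaudio and torchvision builds carry the "+cu124" local version tag, which is served from the PyTorch CUDA wheel index rather than PyPI, so installation presumably needs the extra index, for example:

pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu124

The pywin32 and win32_setctime pins also tie this lock file to a Windows environment.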
scripts/ingest_data.py
ADDED
|
@@ -0,0 +1,203 @@
# scripts/ingest_data.py
import os
import json
from tqdm import tqdm
from pathlib import Path
from typing import Dict
import shutil

from config.settings import settings
from utils.logger import logger
from qdrant_client import QdrantClient

# Import the processors (unchanged)
from core.data_processing.text_processor import TextProcessor
from core.data_processing.audio_processor import AudioProcessor
# from core.data_processing.video_processor import VideoProcessor
from core.data_processing.image_processor import ImageProcessor

# Import the embedding models (unchanged)
from core.embeddings.text_embedding_model import TextEmbeddingModel
from core.embeddings.image_embedding_model import ImageEmbeddingModel
from core.embeddings.audio_embedding_model import AudioEmbeddingModel

# Import the NEW Qdrant-backed VectorDBManager
from core.retrieval.vector_db_manager import VectorDBManager

def walk_through_files(extensions: set, raw_dir: Path, all_raw_chunks_from_processors, processor):
    all_files = list(raw_dir.rglob("*"))
    for filepath in tqdm(all_files, desc="Processing " + raw_dir.name):
        if filepath.suffix in extensions and filepath.is_file():
            all_raw_chunks_from_processors.extend(
                processor.process(str(filepath))
            )

def ingest_data_pipeline():
    logger.info("Starting comprehensive data ingestion pipeline (Chunking + Embedding + Qdrant Indexing)...")

    # --- 1. Initialize the processors --- (unchanged)
    text_processor = TextProcessor(chunk_size=500, chunk_overlap=50)
    audio_processor = AudioProcessor(min_silence_len=1000, silence_thresh_db=-40, target_sr=16000)
    image_processor = ImageProcessor()
    # video_processor = VideoProcessor(chunk_duration_sec=15, frames_per_segment=5)

    # --- Clean up old chunk directories and old Qdrant data ---
    dirs_to_clean_and_create = [
        settings.CHUNKS_DIR,
        settings.METADATA_DIR
    ]
    # Qdrant data directory
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    dirs_to_clean_and_create.append(qdrant_db_path)

    for dir_path in dirs_to_clean_and_create:
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
            logger.info(f"Cleaned up old directory: {dir_path}")
        # Recreate the chunking directories, but not the Qdrant one (the client creates it itself)
        if dir_path != qdrant_db_path:
            os.makedirs(dir_path, exist_ok=True)

    logger.info("Output directories and previous Qdrant data are ready for fresh ingestion.")

    client = QdrantClient(path=qdrant_db_path)
    logger.info(f"Single Qdrant client initialized for ingestion, connected to: {qdrant_db_path}")

    all_raw_chunks_from_processors = []  # Holds every chunk (content plus metadata)

    # --- 2. Run data processing (chunking) --- (unchanged)
    logger.info("--- Phase 1: Processing Raw Data into Chunks ---")

    # Process text
    text_extensions = {".txt"}
    text_raw_dir = Path(settings.RAW_DATA_DIR) / "texts"
    walk_through_files(text_extensions, text_raw_dir, all_raw_chunks_from_processors, text_processor)

    # Process audio
    audio_extensions = {".wav", ".mp3"}
    audio_raw_dir = Path(settings.RAW_DATA_DIR) / "audios"
    walk_through_files(audio_extensions, audio_raw_dir, all_raw_chunks_from_processors, audio_processor)

    # Process images
    image_extensions = {".jpg", ".png"}
    image_raw_dir = Path(settings.RAW_DATA_DIR) / "images"
    walk_through_files(image_extensions, image_raw_dir, all_raw_chunks_from_processors, image_processor)

    # Process video
    # video_raw_dir = os.path.join(settings.RAW_DATA_DIR, "videos")
    # for filename in tqdm(os.listdir(video_raw_dir), desc="Processing Video"):
    #     if filename.endswith((".mp4", ".avi", ".mov")):
    #         all_raw_chunks_from_processors.extend(video_processor.process_video(os.path.join(video_raw_dir, filename)))

    logger.info(f"Total raw chunks processed from all sources: {len(all_raw_chunks_from_processors)}")

    # --- 3. Generate embeddings and add them to Qdrant ---
    logger.info("--- Phase 2: Generating Embeddings and Building Qdrant Collections ---")

    # Initialize the embedding models
    text_embedder = TextEmbeddingModel()
    image_embedder = ImageEmbeddingModel()
    audio_embedder = AudioEmbeddingModel()

    # --- Initialize the VectorDBManagers for Qdrant ---
    # Take the embedding dimension from the model itself so it is exact
    text_embedding_dim = text_embedder.model.get_sentence_embedding_dimension()
    text_vector_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_embedding_dim, client=client)

    # Embedding dimension for image/audio (assumed to be 512)
    image_embedding_dim = 512
    image_vector_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_embedding_dim, client=client)

    # video_frame_embedding_dim = 512
    # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)

    audio_embedding_dim = 512
    audio_vector_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=audio_embedding_dim, client=client)

    logger.info(f"Initialized Text Qdrant Collection Manager with {text_embedding_dim}D.")
    logger.info(f"Initialized Image Qdrant Collection Manager with {image_embedding_dim}D.")
    logger.info(f"Initialized Audio Qdrant Collection Manager with {audio_embedding_dim}D.")

    # Build batches so inserts into Qdrant are more efficient
    text_embeddings_batch = []
    text_metadatas_batch = []

    image_embeddings_batch = []
    image_metadatas_batch = []

    # video_frame_embeddings_batch = []
    # video_frame_metadatas_batch = []

    audio_embeddings_batch = []
    audio_metadatas_batch = []

    BATCH_SIZE = 32  # Upsert 32 points at a time

    for chunk_data in tqdm(all_raw_chunks_from_processors, desc="Generating Embeddings & Populating Qdrant"):
        chunk_type = chunk_data['metadata']['type']
        content = chunk_data['content']

        try:
            if chunk_type == "text":
                embedding = text_embedder.get_embeddings([content])[0]
                text_embeddings_batch.append(embedding)
                text_metadatas_batch.append(chunk_data)

            elif chunk_type == "audio":
                embedding = audio_embedder.get_embeddings([content])[0]
                audio_embeddings_batch.append(embedding)
                audio_metadatas_batch.append(chunk_data)

            elif chunk_type == "image":
                embedding = image_embedder.get_embeddings([content])[0]
                image_embeddings_batch.append(embedding)
                image_metadatas_batch.append(chunk_data)

            # elif chunk_type == "video_frame":
            #     if content and isinstance(content, list) and len(content) > 0:
            #         embedding = image_embedder.get_embeddings([content[0]])[0]  # Only embed the first frame
            #         video_frame_embeddings_batch.append(embedding)
            #         video_frame_metadatas_batch.append(chunk_data['metadata'])

            # Flush batches once they are full
            if len(text_embeddings_batch) >= BATCH_SIZE:
                text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
                text_embeddings_batch, text_metadatas_batch = [], []  # Reset batch

            if len(audio_embeddings_batch) >= BATCH_SIZE:
                audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
                audio_embeddings_batch, audio_metadatas_batch = [], []  # Reset batch

            if len(image_embeddings_batch) >= BATCH_SIZE:
                image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
                image_embeddings_batch, image_metadatas_batch = [], []  # Reset batch

            # if len(video_frame_embeddings_batch) >= BATCH_SIZE:
            #     video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
            #     video_frame_embeddings_batch, video_frame_metadatas_batch = [], []  # Reset batch

        except Exception as e:
            logger.error(f"Error processing chunk {chunk_data['metadata']['chunk_id']}: {e}")

    # Add whatever is left in the final batches
    if text_embeddings_batch:
        text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
    if audio_embeddings_batch:
        audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
    if image_embeddings_batch:
        image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
    # if video_frame_embeddings_batch:
    #     video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)

    logger.success("Finished populating Qdrant collections.")
    logger.info(f"Total vectors in 'text_collection': {text_vector_db_manager.get_total_vectors()}")
    logger.info(f"Total vectors in 'audio_collection': {audio_vector_db_manager.get_total_vectors()}")
    logger.info(f"Total vectors in 'image_collection': {image_vector_db_manager.get_total_vectors()}")
    # logger.info(f"Total vectors in 'video_frame_collection': {video_frame_vector_db_manager.get_total_vectors()}")

    logger.info("Data ingestion pipeline completed successfully!")


if __name__ == "__main__":
    ingest_data_pipeline()
scripts/ingestion.py
ADDED
|
@@ -0,0 +1,227 @@
# scripts/ingestion.py
import os
import gradio as gr
from typing import List, Optional, Callable
from tqdm import tqdm

from utils.logger import logger
from config.settings import settings
from qdrant_client import QdrantClient

# Import the processors (unchanged)
from core.data_processing.text_processor import TextProcessor
from core.data_processing.audio_processor import AudioProcessor
# from core.data_processing.video_processor import VideoProcessor
from core.data_processing.image_processor import ImageProcessor

# Import the embedding models (unchanged)
from core.embeddings.text_embedding_model import TextEmbeddingModel
from core.embeddings.image_embedding_model import ImageEmbeddingModel
from core.embeddings.audio_embedding_model import AudioEmbeddingModel

# Import the NEW Qdrant-backed VectorDBManager
from core.retrieval.vector_db_manager import VectorDBManager

class IngestionService:
    def __init__(self, client: QdrantClient):
        """
        Initialize the IngestionService with a shared QdrantClient.
        This version does not track per-file state.
        """
        logger.info("Initializing IngestionService (Stateless)...")

        self.client = client
        self.text_processor = TextProcessor()
        self.image_processor = ImageProcessor()
        self.audio_processor = AudioProcessor()
        # self.video_processor = VideoProcessor()
        self.text_embedder = TextEmbeddingModel()
        self.image_embedder = ImageEmbeddingModel()
        self.audio_embedder = AudioEmbeddingModel()

        text_dim = self.text_embedder.model.get_sentence_embedding_dimension()
        self.text_db_manager = VectorDBManager(
            client=self.client,
            collection_name="text_collection",
            embedding_dim=text_dim
        )

        image_embedding_dim = 512
        self.image_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="image_collection",
            embedding_dim=image_embedding_dim
        )

        audio_embedding_dim = 512
        self.audio_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="audio_collection",
            embedding_dim=audio_embedding_dim
        )

        # video_frame_embedding_dim = 512
        # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)

        logger.info("IngestionService initialized successfully.")

    def ingest_files(self, file_paths: List[str]):
        """
        Process a list of files, generate embeddings, and add them to Qdrant.
        Assumes the files have already been placed in the correct 'raw' subdirectories.
        """
        return self.ingest_files_with_progress(file_paths, None)

    def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
        """
        Process a list of files with progress tracking.
        """
        logger.info(f"Starting ingestion for {len(file_paths)} files...")

        if progress_callback:
            progress_callback(0.4, desc="Starting file processing...")

        all_chunks_to_process = []

        # 1. Scan the files and create chunks
        for i, file_path in enumerate(file_paths):
            base_progress = 0.4 + (i / len(file_paths)) * 0.3  # 40% -> 70%
            file_name = os.path.basename(file_path)

            if progress_callback:
                progress_callback(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")

            # Determine the data type from the file extension
            file_ext = os.path.splitext(file_path)[1].lower()
            data_type = None
            try:
                if progress_callback:
                    progress_callback(base_progress + 0.01, desc=f"Reading {file_name}...")

                if file_ext in ['.txt']:
                    data_type = 'text'
                    chunks = self.text_processor.process(file_path)
                elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
                    data_type = 'image'
                    chunks = self.image_processor.process(file_path)
                elif file_ext in ['.wav', '.mp3']:
                    data_type = 'audio'
                    chunks = self.audio_processor.process(file_path)
                # elif file_ext in ['.mp4', '.avi', '.mov']:
                #     data_type = 'video'
                #     chunks = self.video_processor.process_video(file_path)
                else:
                    logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
                    continue

                if progress_callback:
                    progress_callback(base_progress + 0.02, desc=f"Generated {len(chunks)} chunks from {file_name}")

                all_chunks_to_process.extend(chunks)

            except Exception as e:
                logger.error(f"Error processing file {file_path}: {e}")
                continue

        if not all_chunks_to_process:
            logger.warning("No processable chunks were generated from the provided files.")
            return

        logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")

        if progress_callback:
            progress_callback(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")

        # 2. Generate embeddings and add them to Qdrant (in batches)
        text_embeddings_batch, text_metadatas_batch = [], []
        audio_embeddings_batch, audio_metadatas_batch = [], []
        image_embeddings_batch, image_metadatas_batch = [], []
        BATCH_SIZE = 32

        for i, chunk_data in enumerate(all_chunks_to_process):
            # Compute a finer-grained progress value
            base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25  # 70% -> 95%

            chunk_type = chunk_data['metadata']['type']
            content = chunk_data['content']
            chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')

            try:
                if progress_callback:
                    progress_callback(base_progress, desc=f"Processing chunk {i+1}/{len(all_chunks_to_process)} ({chunk_type})")

                embedding = None
                if chunk_type == "text":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating text embedding for chunk {i+1}")
                    embedding = self.text_embedder.get_embeddings([content])[0]
                    text_embeddings_batch.append(embedding)
                    text_metadatas_batch.append(chunk_data)
                elif chunk_type == "audio":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating audio embedding for chunk {i+1}")
                    embedding = self.audio_embedder.get_embeddings([content])[0]
                    audio_embeddings_batch.append(embedding)
                    audio_metadatas_batch.append(chunk_data)
                elif chunk_type == "image":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating image embedding for chunk {i+1}")
                    embedding = self.image_embedder.get_embeddings([content])[0]
                    image_embeddings_batch.append(embedding)
                    image_metadatas_batch.append(chunk_data)

                # Flush a batch once it is full, with a progress update
                if len(text_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
                    self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
                    text_embeddings_batch, text_metadatas_batch = [], []

                if len(audio_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
                    self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
                    audio_embeddings_batch, audio_metadatas_batch = [], []

                if len(image_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
                    self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
                    image_embeddings_batch, image_metadatas_batch = [], []

            except Exception as e:
                logger.error(f"Error ingesting chunk {chunk_id}: {e}")

        if progress_callback:
            progress_callback(0.95, desc="Saving final batches...")

        # Add whatever is left in the final batches
        final_operations = []
        if text_embeddings_batch:
            final_operations.append(("text", len(text_embeddings_batch)))
        if audio_embeddings_batch:
            final_operations.append(("audio", len(audio_embeddings_batch)))
        if image_embeddings_batch:
            final_operations.append(("image", len(image_embeddings_batch)))

        for i, (batch_type, count) in enumerate(final_operations):
            current_progress = 0.95 + (i / len(final_operations)) * 0.04  # 95% -> 99%

            if batch_type == "text" and text_embeddings_batch:
                if progress_callback:
                    progress_callback(current_progress, desc=f"Saving final {count} text embeddings...")
                self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
            elif batch_type == "audio" and audio_embeddings_batch:
                if progress_callback:
                    progress_callback(current_progress, desc=f"Saving final {count} audio embeddings...")
                self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
            elif batch_type == "image" and image_embeddings_batch:
                if progress_callback:
                    progress_callback(current_progress, desc=f"Saving final {count} image embeddings...")
                self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)

        if progress_callback:
            progress_callback(1.0, desc=f"✅ Successfully ingested {len(file_paths)} files with {len(all_chunks_to_process)} chunks!")

        logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")
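
A minimal sketch of how a Gradio handler could drive this service: the data/qdrant_data path and the handler wiring are assumptions, but gr.Progress() is callable with (fraction, desc=...), which matches the progress_callback signature used above.

import gradio as gr
from qdrant_client import QdrantClient
from scripts.ingestion import IngestionService

client = QdrantClient(path="data/qdrant_data")  # assumed shared local Qdrant path
service = IngestionService(client)

def handle_upload(files, progress=gr.Progress()):
    # Gradio upload components hand back temp-file objects exposing a .name path
    paths = [f.name for f in files]
    service.ingest_files_with_progress(paths, progress)
    return f"Ingested {len(paths)} file(s)."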
utils/logger.py
ADDED
|
@@ -0,0 +1,24 @@
import sys
from loguru import logger
from config.settings import settings  # Import our project settings

# Configure the logger
logger.remove()  # Remove the default handler
logger.add(
    "logs/file_{time}.log",    # Write logs to a file, named by timestamp
    rotation="10 MB",          # Rotate the log file once it reaches 10 MB
    compression="zip",         # Compress rotated log files
    level=settings.LOG_LEVEL,  # Log level taken from settings
    colorize=True,             # Colorize output
    format="{time} {level} {message}",
    enqueue=True               # Queue log writes so logging never blocks (important for multi-threaded/async apps)
)
logger.add(
    sys.stderr,                # Also log to the console
    level=settings.LOG_LEVEL,
    colorize=True,
    format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
)

# Export the logger so other modules can import and use it
__all__ = ["logger"]