3v324v23 commited on
Commit
fcb8b13
·
1 Parent(s): 3966beb

Add application file

Browse files
.gitignore ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
app.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/main.py
2
+ import gradio as gr
3
+ import os
4
+ import sys
5
+ import shutil
6
+ import zipfile
7
+ from typing import List, Dict, Any
8
+ from pathlib import Path
9
+
10
+ # Thêm thư mục gốc của dự án vào Python Path để có thể import các module
11
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
12
+ if project_root not in sys.path:
13
+ sys.path.insert(0, project_root)
14
+
15
+ from utils.logger import logger
16
+ from config.settings import settings
17
+ from qdrant_client import QdrantClient
18
+ from core.retrieval.retriever import Retriever
19
+ from scripts.ingestion import IngestionService
20
+
21
+ # --- 1. Khởi tạo các dịch vụ toàn cục ---
22
+ logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
23
+ try:
24
+ # Tạo MỘT QdrantClient duy nhất để chia sẻ
25
+ qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
26
+ shared_qdrant_client = QdrantClient(path=qdrant_db_path)
27
+ logger.info("Shared Qdrant client initialized.")
28
+
29
+ # Khởi tạo các dịch vụ, chia sẻ client
30
+ ingestion_service = IngestionService(client=shared_qdrant_client)
31
+ retriever_instance = Retriever(client=shared_qdrant_client)
32
+
33
+ logger.info("All services initialized successfully.")
34
+ except Exception as e:
35
+ logger.error(f"Failed to initialize global services: {e}")
36
+ raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
37
+
38
+
39
+ # ---- HÀM XỬ LÝ CHO TAB UPLOAD ----
40
def upload_handler(zip_path: str, progress=gr.Progress()):
    """Extract an uploaded ZIP archive into the raw-data directory and ingest
    every regular file it contains.

    Args:
        zip_path: Filesystem path of the uploaded archive (Gradio `type="filepath"`).
        progress: Gradio progress tracker, injected automatically by Gradio.

    Returns:
        A human-readable status string shown in the "Status" textbox.
    """
    progress(0, desc="🚀 Starting upload process...")

    if not zip_path:
        return "Error: No file uploaded"

    if not zip_path.endswith(".zip"):
        return "Error: Please upload a zip file"

    progress(0.05, desc="📦 Extracting ZIP file...")

    try:
        # NOTE(review): extractall sanitizes absolute paths / drive letters, but
        # archives from untrusted users are still worth auditing for path tricks.
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(settings.RAW_DATA_DIR)
    except zipfile.BadZipFile:
        return "Invalid ZIP file."
    except Exception as e:
        return f"Error during extraction: {str(e)}"

    progress(0.15, desc="🔍 Scanning for files...")

    # Collect every regular file under RAW_DATA_DIR (recursively).
    raw_dir = Path(settings.RAW_DATA_DIR)
    files_to_ingest = [str(p) for p in raw_dir.rglob("*") if p.is_file()]

    progress(0.25, desc="📊 Analyzing files...")

    if not files_to_ingest:
        return "No valid files found in the uploaded items."

    # BUG FIX: the original logged len(settings.RAW_DATA_DIR) — the length of
    # the directory *path string* — as the number of uploaded items. Log the
    # actual file count instead. Also removed a pointless list .copy() and an
    # unreachable duplicate emptiness check.
    logger.info(f"Found a total of {len(files_to_ingest)} files to process.")
    progress(0.35, desc=f"✅ Found {len(files_to_ingest)} files to process...")

    # Hand the whole batch to the ingestion service in one call.
    try:
        progress(0.4, desc="🔄 Starting file ingestion...")
        ingestion_service.ingest_files_with_progress(files_to_ingest)

        success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
        logger.success(success_message)
        return success_message
    except Exception as e:
        error_message = f"An error occurred during the ingestion process: {e}"
        logger.error(error_message)
        return error_message
97
+
98
+ # ---- HÀM XỬ LÝ CHO TAB SEARCH ----
99
def search_handler(text_query: str, image_query_path: str, audio_query_path: str, top_k: int):
    """Run one retrieval query and map the results onto the fixed grid of
    Gradio result components.

    Exactly one query modality is used, with precedence text > image > audio.
    The return value is a flat list whose order MUST match `all_outputs` in
    `create_and_run_app`: first the info textbox, then 5 component updates
    (group, markdown, textbox, image, audio) for each of the 10 result slots.
    """
    def create_empty_updates(max_results=10):
        # Build hidden-state updates for all result slots (5 components each).
        updates = []
        for _ in range(max_results):
            # Exactly 5 components per result: group, markdown, textbox, image, audio.
            updates.extend([
                gr.Group(visible=False),
                gr.Markdown(visible=False),
                gr.Textbox(visible=False),
                gr.Image(visible=False),
                gr.Audio(visible=False)
            ])
        return updates

    # Check the database before processing the query.
    try:
        if retriever_instance.is_database_empty():
            empty_db_message = gr.Textbox(
                value="Database is empty. Please go to the 'Upload Data' tab to add files first.",
                visible=True
            )
            return [empty_db_message] + create_empty_updates()
    except Exception as e:
        error_message = gr.Textbox(
            value=f"Error checking database: {str(e)}",
            visible=True
        )
        return [error_message] + create_empty_updates()

    # Pick the query modality; text wins over image, image over audio.
    query_type, query_content = None, None
    if text_query and text_query.strip():
        query_type, query_content = "text", text_query
    elif image_query_path:
        query_type, query_content = "image", image_query_path
    elif audio_query_path:
        query_type, query_content = "audio", audio_query_path

    max_results = 10  # Must match the number of result components created in the UI.

    if not query_type:
        return [gr.Textbox(value="Error: Please provide a query.", visible=True)] + create_empty_updates()

    try:
        logger.info(f"Handling '{query_type}' query: {query_content}")
        results = retriever_instance.retrieve(query_content, query_type, int(top_k))

        if not results:
            return [gr.Textbox(value="No results found.", visible=True)] + create_empty_updates()

        output_updates = [gr.Textbox(value="", visible=False)]  # Hide the info_box on success.
        for i in range(max_results):
            if i < len(results):
                res = results[i]
                score, metadata, content = res['score'], res['metadata'], res.get('content')
                chunk_type, source_id = metadata.get('type', 'N/A'), metadata.get('source_id', 'N/A')
                info_text = f"### Result {i + 1} (Score: {score:.4f})\n**Type:** `{chunk_type}` | **Source:** `{source_id}`"

                # Default every media slot to hidden; reveal only the matching one.
                text_val, text_visible = "", False
                img_val, img_visible = None, False
                audio_val, audio_visible = None, False

                if chunk_type == 'text':
                    text_val, text_visible = content, True
                elif chunk_type == "image":
                    # `content` is expected to be a filesystem path for media chunks.
                    if content and os.path.exists(content):
                        img_val, img_visible = content, True
                    else:
                        text_val, text_visible = "`Image content not found at path.`", True
                elif chunk_type == 'audio':
                    if content and os.path.exists(content):
                        audio_val, audio_visible = content, True
                    else:
                        text_val, text_visible = "`Audio content not found at path.`", True

                # Exactly 5 components per result.
                output_updates.extend([
                    gr.Group(visible=True),
                    gr.Markdown(value=info_text, visible=True),
                    gr.Textbox(value=text_val, visible=text_visible),
                    gr.Image(value=img_val, visible=img_visible),
                    gr.Audio(value=audio_val, visible=audio_visible)
                ])
            else:
                # Exactly 5 components per result; unused slots stay hidden.
                output_updates.extend([
                    gr.Group(visible=False),
                    gr.Markdown(visible=False),
                    gr.Textbox(visible=False),
                    gr.Image(visible=False),
                    gr.Audio(visible=False)
                ])

        return output_updates

    except Exception as e:
        error_message = f"Error during search: {str(e)}"
        logger.error(error_message)
        return [gr.Textbox(value=error_message, visible=True)] + create_empty_updates()
197
+
198
+ # --- 3. Xây dựng giao diện với Gradio Blocks ---
199
def create_and_run_app():
    """Build the Gradio Blocks UI (Search tab + Upload tab) and return it.

    Despite the name, this function only constructs the `demo` object; the
    caller is responsible for `demo.launch()`. The order of `all_outputs`
    must stay in lock-step with the list `search_handler` returns.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Multimedia RAG Assistant") as demo:
        gr.Markdown("# Multimedia RAG Assistant")

        with gr.Tabs() as tabs:
            # --- TAB 1: SEARCH ---
            with gr.TabItem("Search Database", id=0):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input Query")
                        text_query_input = gr.Textbox(label="Text Query", placeholder="e.g., a dog playing in a park")
                        image_query_input = gr.Image(label="Image Query", type="filepath")
                        audio_query_input = gr.Audio(label="Audio Query", type="filepath")
                        top_k_slider = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Top K Results")
                        search_button = gr.Button("Search", variant="primary")

                    with gr.Column(scale=2):
                        gr.Markdown("### Retrieval Results")
                        info_box = gr.Textbox(label="Info", interactive=False, visible=False)
                        max_results = 10  # Must match max_results inside search_handler.
                        result_components = []

                        # Pre-create a fixed grid of hidden result slots;
                        # search_handler toggles their visibility/values.
                        for i in range(max_results):
                            with gr.Group(visible=False) as result_group:
                                result_info = gr.Markdown()
                                result_text = gr.Textbox(label="Text Content", interactive=False, visible=False)
                                result_image = gr.Image(label="Image Content", interactive=False, visible=False)
                                result_audio = gr.Audio(label="Audio Content", visible=False, type="filepath")
                                # Exactly 5 components per result slot.
                                result_components.extend([result_group, result_info, result_text, result_image, result_audio])

                all_outputs = [info_box] + result_components
                search_button.click(
                    fn=search_handler,
                    inputs=[text_query_input, image_query_input, audio_query_input, top_k_slider],
                    outputs=all_outputs
                )

            # --- TAB 2: UPLOAD ---
            with gr.TabItem("Upload Data", id=1):
                gr.Markdown("### Upload New Data to the Database")
                gr.Markdown("You can upload multiple files of different types at once (text, images, audio), or drop a folder.")
                with gr.Column():
                    upload_file_input = gr.File(
                        label="Upload ZIP file containing your data",
                        file_types=[".zip"],
                        file_count="single",
                        type="filepath"
                    )
                    upload_button = gr.Button("Upload and Ingest", variant="primary")
                    upload_status = gr.Textbox(label="Status", interactive=False, placeholder="Upload status will be shown here...")

                upload_button.click(
                    fn=upload_handler,
                    inputs=[upload_file_input],
                    outputs=[upload_status],
                    show_progress="full"  # Show the progress bar.
                )

        # Event handling: entering one query modality clears the other two
        # inputs on the Search tab, keeping a single active modality.
        def clear_search_inputs(input_type):
            if input_type == 'text': return gr.Image(value=None), gr.Audio(value=None)
            elif input_type == 'image': return gr.Textbox(value=""), gr.Audio(value=None)
            elif input_type == 'audio': return gr.Textbox(value=""), gr.Image(value=None)

        text_query_input.change(lambda: clear_search_inputs('text'), outputs=[image_query_input, audio_query_input], queue=False)
        image_query_input.change(lambda: clear_search_inputs('image'), outputs=[text_query_input, audio_query_input], queue=False)
        audio_query_input.change(lambda: clear_search_inputs('audio'), outputs=[text_query_input, image_query_input], queue=False)

    return demo
269
+
270
+ # --- 4. Chạy ứng dụng ---
271
# Entry point: build the Gradio UI and serve it with default launch settings.
if __name__ == "__main__":
    logger.info("Launching Gradio interface...")
    demo = create_and_run_app()
    demo.launch()
config/database_configs.py ADDED
File without changes
config/model_configs.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config/model_configs.py
# Central registry of model identifiers and generation parameters used by the
# RAG pipeline. All values are Hugging Face / OpenAI model IDs unless noted.

# Embedding Models
TEXT_EMBEDDING_MODEL: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
IMAGE_EMBEDDING_MODEL: str = "openai/clip-vit-base-patch32"  # or other CLIP variants
AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"  # example CLAP model

# Generator Model (LLM/LMM)
GENERATOR_MODEL_NAME: str = "gpt-4o"  # or "google/gemma-2b", "meta-llama/Llama-2-7b-chat-hf", "llava-hf/llava-1.5-7b-hf"
GENERATOR_MODEL_MAX_TOKENS: int = 4096
GENERATOR_MODEL_TEMPERATURE: float = 0.7

# Reranker Model
RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Automatic Speech Recognition (ASR) model (e.g. Hugging Face Whisper).
ASR_MODEL: str = "openai/whisper-tiny"  # "base"/"small"/"medium" depending on GPU resources
config/settings.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+ import os
3
+ from typing import Optional
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
class Settings(BaseSettings):
    """Application-wide configuration, loaded from environment / .env.

    Field values may be overridden by environment variables (case-sensitive);
    unknown env entries are ignored via `extra="ignore"`.
    """
    APP_NAME: str = "Multimedia RAG Assistant"
    APP_VERSION: str = "0.1.0"
    # BUG FIX: os.getenv('ENVIRONMENT') returns None when the variable is not
    # set, which fails pydantic validation for a non-optional `str` field and
    # crashes at import time. Fall back to a sane default instead.
    ENVIRONMENT: str = os.getenv('ENVIRONMENT', 'development')

    # Data layout: raw uploads plus derived artifacts (chunks/metadata/embeddings).
    DATA_DIR: str = "data"
    RAW_DATA_DIR: str = os.path.join("data", "raw")
    PROCESSED_DATA_DIR: str = os.path.join("data", "processed")
    CHUNKS_DIR: str = os.path.join("data", "processed", "chunks")
    METADATA_DIR: str = os.path.join("data", "processed", "metadata")
    EMBEDDINGS_DIR: str = os.path.join("data", "processed", "embeddings")

    # API server bind address.
    API_HOST: str = "0.0.0.0"
    API_PORT: int = 8000

    # Model configuration / credentials (API keys, model IDs, etc.).
    HUGGINGFACE_API_KEY: Optional[str] = os.getenv('HUGGINGFACE_API_KEY')  # e.g. when using Hugging Face models

    # Logger configuration
    LOG_LEVEL: str = "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
        case_sensitive=True
    )

# Singleton settings instance imported across the project.
settings = Settings()
core/data_processing/audio_processor.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/data_processing/audio_processor.py
2
+ import os
3
+
4
+ from typing import List, Dict, Any
5
+ from utils.logger import logger
6
+ from pydub import AudioSegment
7
+ from pydub.silence import split_on_silence
8
+ from config.settings import settings
9
+
10
+ class AudioProcessor:
11
+ def __init__(self, min_silence_len: int = 1000, silence_thresh_db: int = -40, target_sr: int = 16000):
12
+ self.min_silence_len = min_silence_len
13
+ self.silence_thresh_db = silence_thresh_db
14
+ self.target_sr = target_sr
15
+ logger.info(f"AudioProcessor initialized (min_silence_len={min_silence_len}ms, silence_thresh_db={silence_thresh_db}dB).")
16
+
17
+ def process(self, file_path: str) -> List[Dict[str, Any]]:
18
+ try:
19
+ logger.info(f"Processing audio file: {file_path}")
20
+ audio = AudioSegment.from_file(file_path)
21
+
22
+ if audio.frame_rate != self.target_sr:
23
+ audio = audio.set_frame_rate(self.target_sr)
24
+
25
+ audio_segments = split_on_silence(
26
+ audio,
27
+ min_silence_len=self.min_silence_len,
28
+ silence_thresh=self.silence_thresh_db,
29
+ keep_silence=500
30
+ )
31
+
32
+ chunks = []
33
+ audio_chunks_dir = os.path.join(settings.CHUNKS_DIR, "audio")
34
+ os.makedirs(audio_chunks_dir, exist_ok=True)
35
+
36
+ for i, segment in enumerate(audio_segments):
37
+ segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
38
+ chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
39
+
40
+ # Lưu segment thành file WAV tạm thời
41
+ segment.export(chunk_file_path, format="wav")
42
+
43
+ metadata = {
44
+ "source_id": os.path.basename(file_path),
45
+ "type": "audio",
46
+ "chunk_id": segment_id,
47
+ "chunk_data_path": chunk_file_path,
48
+ # "start_time_ms": int(segment.start_time),
49
+ # "end_time_ms": int(segment.end_time),
50
+ "duration_ms": len(segment)
51
+ }
52
+ chunks.append({
53
+ "content": chunk_file_path,
54
+ "metadata": metadata
55
+ })
56
+ logger.info(f"Generated {len(chunks)} audio segments from {file_path}")
57
+ return chunks
58
+ except FileNotFoundError:
59
+ logger.error(f"Audio file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
60
+ return []
61
+ except Exception as e:
62
+ logger.error(f"Error processing audio file {file_path}: {e}")
63
+ return []
64
+
65
+ # Ví dụ sử dụng (giữ nguyên để kiểm tra)
66
# Manual smoke test: split a sample audio file and print the resulting chunks.
if __name__ == "__main__":
    sample_audio_path = os.path.join(settings.RAW_DATA_DIR, "audios", "sample_audio.wav")
    if not os.path.exists(sample_audio_path):
        print(f"ERROR: Sample audio not found at {sample_audio_path}. Please create it first.")
        print("Make sure you have ffmpeg installed and available in your PATH for pydub to work.")
    else:
        processor = AudioProcessor()
        audio_chunks = processor.process(sample_audio_path)

        for i, chunk in enumerate(audio_chunks):
            print(f"\n--- Audio Chunk {i+1} ---")
            print(f"Type: {chunk['metadata']['type']}")
            print(f"Content (path): {chunk['content']}")
            print(f"Metadata: {chunk['metadata']}")
            # You can open the file at chunk['content'] to listen to it.
core/data_processing/image_processor.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/data_processing/image_processor.py
2
+ from typing import List, Dict, Any
3
+ import os
4
+ from PIL import Image
5
+ from utils.logger import logger
6
+
7
class ImageProcessor:
    """Turn one image file into a single retrieval chunk.

    The chunk's content is the image's filesystem path (mirroring how the
    audio/video processors reference media), and its metadata captures basic
    image properties (dimensions, format, file size).
    """

    def __init__(self):
        logger.info("ImageProcessor initialized.")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        """Return a one-element chunk list for *file_path*, or [] on failure.

        Errors are logged rather than raised, matching the other processors.
        """
        try:
            logger.debug(f"Processing image file: {file_path}")

            if not os.path.exists(file_path):
                logger.error(f"Image file not found: {file_path}")
                return []

            # Read dimensions/format; the context manager closes the handle.
            with Image.open(file_path) as img:
                w, h = img.size
                fmt = img.format

            base_name = os.path.basename(file_path)

            # Single chunk whose id is derived from the filename stem.
            return [{
                "content": file_path,
                "metadata": {
                    "source_id": base_name,
                    "type": "image",
                    "chunk_id": f"{os.path.splitext(base_name)[0]}_image_chunk",
                    "chunk_data_path": file_path,
                    "image_width": w,
                    "image_height": h,
                    "image_format": fmt,
                    "file_size_bytes": os.path.getsize(file_path),
                },
            }]

        except Exception as e:
            logger.error(f"Error processing image file {file_path}: {e}")
            return []
54
+
55
+ # Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
56
# Manual smoke test: create a dummy image, process it, print the chunk, clean up.
if __name__ == "__main__":
    from config.settings import settings
    import os

    # Create a dummy image for testing.
    dummy_image_dir = os.path.join(settings.RAW_DATA_DIR, "images")
    os.makedirs(dummy_image_dir, exist_ok=True)
    dummy_image_path = os.path.join(dummy_image_dir, "test_image.jpg")

    try:
        # Create a small solid-blue sample image.
        dummy_img = Image.new('RGB', (100, 150), color = 'blue')
        dummy_img.save(dummy_image_path)
        print(f"Created a dummy image for testing at: {dummy_image_path}")

        # Instantiate the processor and process the image.
        processor = ImageProcessor()
        image_chunks = processor.process(dummy_image_path)

        if image_chunks:
            print("\n--- Image Chunk Processed ---")
            chunk = image_chunks[0]
            print(f"Content (path): {chunk['content']}")
            print("Metadata:")
            for key, value in chunk['metadata'].items():
                print(f"  - {key}: {value}")
        else:
            print("Failed to process the dummy image.")

    finally:
        # Clean up the dummy image.
        if os.path.exists(dummy_image_path):
            os.remove(dummy_image_path)
            print(f"Cleaned up dummy image: {dummy_image_path}")
core/data_processing/text_processor.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from typing import List, Dict, Any
4
+ from utils.logger import logger
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+
7
class TextProcessor:
    """Chunk plain-text documents using LangChain's RecursiveCharacterTextSplitter.

    Each chunk carries its source filename, a deterministic chunk id, and the
    character offsets of the chunk within the original document.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,  # character count; a tokenizer could be swapped in
            add_start_index=True
        )
        logger.info(f"TextProcessor initialized with LangChain's RecursiveCharacterTextSplitter (chunk_size={chunk_size}, chunk_overlap={chunk_overlap})")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        """Read *file_path* as UTF-8, split it, and return chunk dicts.

        Returns [] on any failure; errors are logged rather than raised.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as fh:
                full_text = fh.read()
            logger.info(f"Processing text document: {file_path}")

            source_name = os.path.basename(file_path)
            stem = source_name.split('.')[0]

            chunks = []
            for idx, piece in enumerate(self.text_splitter.split_text(full_text)):
                # NOTE(review): str.find returns the FIRST occurrence, which can
                # mis-locate a chunk when overlapping chunks repeat verbatim —
                # the splitter's own start_index metadata would be exact; verify.
                start = full_text.find(piece)
                chunks.append({
                    "content": piece,
                    "metadata": {
                        "source_id": source_name,
                        "type": "text",
                        "chunk_id": f"{stem}_chunk_text_{idx}",
                        "start_char_index": start,      # starting character position
                        "end_char_index": start + len(piece),  # ending character position
                        "content_length": len(piece),
                    },
                })
            logger.info(f"Generated {len(chunks)} text chunks from {file_path}")
            return chunks
        except Exception as e:
            logger.error(f"Error processing text document {file_path}: {e}")
            return []
48
+
49
+ # Ví dụ sử dụng (giữ nguyên để kiểm tra)
50
# Manual smoke test: chunk a sample document and print every chunk.
if __name__ == "__main__":
    from config.settings import settings
    import os

    sample_doc_path = os.path.join(settings.RAW_DATA_DIR, "documents", "sample_document.txt")
    if not os.path.exists(sample_doc_path):
        print(f"ERROR: Sample document not found at {sample_doc_path}. Please create it first.")
    else:
        processor = TextProcessor(chunk_size=100, chunk_overlap=20)  # smaller sizes make chunking visible
        text_chunks = processor.process(sample_doc_path)

        for i, chunk in enumerate(text_chunks):  # print every chunk for inspection
            print(f"\n--- Chunk {i+1} ---")
            print(f"Content: {chunk['content']}")  # full chunk content
            print(f"Metadata: {chunk['metadata']}")
core/data_processing/video_processor.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/data_processing/video_processor.py
2
+ import os
3
+ import torch
4
+ import cv2
5
+ import numpy as np
6
+
7
+ from typing import List, Dict, Any
8
+ from utils.logger import logger
9
+ from moviepy.editor import VideoFileClip
10
+ from config.settings import settings
11
+
12
class VideoProcessor:
    """Cut a video into fixed-duration segments, exporting (a) sampled frames
    per segment as JPEGs ("video_frame" chunks) and (b) the segment itself as
    an MP4 clip ("video_segment_clip" chunks)."""

    def __init__(self, chunk_duration_sec: int = 10, frames_per_segment: int = 5):
        # chunk_duration_sec: length of each video segment in seconds.
        # frames_per_segment: number of frames sampled from each segment.
        self.chunk_duration_sec = chunk_duration_sec
        self.frames_per_segment = frames_per_segment
        logger.info(f"VideoProcessor initialized (chunk_duration={chunk_duration_sec}s, frames_per_segment={frames_per_segment}).")

    def process_video(self, file_path: str) -> List[Dict[str, Any]]:
        """Return frame-chunk and clip-chunk dicts for *file_path*, or [] on
        failure (errors are logged). Requires ffmpeg on PATH for moviepy.
        """
        try:
            logger.info(f"Processing video file: {file_path}")
            video_clip = VideoFileClip(file_path)
            total_duration = video_clip.duration  # total video duration (seconds)

            all_chunks = []

            # Output directory for sampled frames of this video.
            image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks", os.path.basename(file_path).split('.')[0])
            os.makedirs(image_chunks_dir, exist_ok=True)

            # Output directory for the exported MP4 segments of this video.
            video_segments_dir = os.path.join(settings.CHUNKS_DIR, "video/video_segments", os.path.basename(file_path).split('.')[0])
            os.makedirs(video_segments_dir, exist_ok=True)

            current_time = 0.0
            chunk_idx = 0

            while current_time < total_duration:
                end_time = min(current_time + self.chunk_duration_sec, total_duration)  # end time of this segment
                segment_clip = video_clip.subclip(current_time, end_time)

                segment_base_name = f"{os.path.basename(file_path).split('.')[0]}_segment_{chunk_idx}"

                frames_paths = []

                # Sample timestamps strictly inside the segment: linspace over
                # n+2 points, then drop the two endpoints.
                frame_timestamps = np.linspace(0, segment_clip.duration, self.frames_per_segment + 2)[1:-1]

                for ts in frame_timestamps:
                    frame = segment_clip.get_frame(ts)
                    frame_filename = f"{segment_base_name}_frame_{int(ts*1000)}.jpg"
                    frame_path = os.path.join(image_chunks_dir, frame_filename)
                    # moviepy yields RGB; OpenCV writes BGR, so convert first.
                    cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                    frames_paths.append(frame_path)

                # 1. Chunk for the sampled frames of this segment.
                image_chunk_id = f"{segment_base_name}_image"
                all_chunks.append({
                    "content": frames_paths,  # list of paths to the frame images
                    "metadata": {
                        "source_id": os.path.basename(file_path),
                        "type": "video_frame",  # chunk type (note: singular)
                        "chunk_id": image_chunk_id,
                        "start_time_sec": current_time,
                        "end_time_sec": end_time,
                        "frame_paths": frames_paths  # duplicated in metadata for lookup
                    }
                })

                # 2. Export the segment clip itself (optional but useful for video retrieval).
                video_segment_path = os.path.join(video_segments_dir, f"{segment_base_name}.mp4")
                segment_clip.write_videofile(video_segment_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)

                # Chunk for the exported video segment.
                video_chunk_id = f"{segment_base_name}_video_clip"
                all_chunks.append({
                    "content": video_segment_path,  # path to the clip file
                    "metadata": {
                        "source_id": os.path.basename(file_path),
                        "type": "video_segment_clip",  # chunk type: video clip
                        "chunk_id": video_chunk_id,
                        "start_time_sec": current_time,
                        "end_time_sec": end_time,
                        "chunk_data_path": video_segment_path  # duplicated in metadata for lookup
                    }
                })

                current_time = end_time
                chunk_idx += 1

            video_clip.close()  # release decoder resources
            logger.info(f"Generated {len(all_chunks)} chunks (frames & video segments) from video {file_path}")
            return all_chunks
        except FileNotFoundError:
            logger.error(f"Video file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
            return []
        except Exception as e:
            logger.error(f"Error processing video file {file_path}: {e}")
            return []
98
+
99
+ # Ví dụ sử dụng (giữ nguyên để kiểm tra)
100
# Manual smoke test: segment a sample video and print every generated chunk.
if __name__ == "__main__":
    sample_video_path = os.path.join(settings.RAW_DATA_DIR, "videos", "sample_video.mp4")
    if not os.path.exists(sample_video_path):
        print(f"ERROR: Sample video not found at {sample_video_path}. Please create it first.")
        print("Make sure you have ffmpeg installed and available in your PATH for moviepy to work.")
    else:
        processor = VideoProcessor(chunk_duration_sec=5, frames_per_segment=3)
        video_chunks = processor.process_video(sample_video_path)

        for i, chunk in enumerate(video_chunks):
            print(f"\n--- Video Chunk {i+1} ---")
            print(f"Type: {chunk['metadata']['type']}")
            # BUG FIX: the processor emits type "video_frame" (singular), but the
            # original demo compared against "video_frames" and so never printed
            # the frame-chunk branch.
            if chunk['metadata']['type'] == 'video_frame':
                print(f"Content (paths): {chunk['content']}")
                if chunk['content']:
                    print(f"Sample frame: {chunk['content'][0]}")
            elif chunk['metadata']['type'] == 'video_segment_clip':
                print(f"Content (path): {chunk['content']}")
            print(f"Metadata: {chunk['metadata']}")
+ print(f"Metadata: {chunk['metadata']}")
core/embeddings/audio_embedding_model.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/embeddings/audio_embedding_model.py
2
+ import torch
3
+ import librosa
4
+ import numpy as np
5
+
6
+ from typing import List
7
+ from transformers import AutoProcessor, AutoModel
8
+ from utils.logger import logger
9
+ from config.model_configs import AUDIO_EMBEDDING_MODEL
10
+
11
class AudioEmbeddingModel:
    """Wraps a pretrained audio model and maps audio files to L2-normalised embeddings."""

    def __init__(self):
        # Prefer GPU when one is visible; both processor inputs and the model live there.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Audio Embedding Model '{AUDIO_EMBEDDING_MODEL}' to device: {self.device}")

        self.processor = AutoProcessor.from_pretrained(AUDIO_EMBEDDING_MODEL)
        self.model = AutoModel.from_pretrained(AUDIO_EMBEDDING_MODEL).to(self.device)
        logger.info("Audio Embedding Model loaded successfully.")

    def _load_waveforms(self, audio_paths: List[str], sample_rate: int) -> list:
        """Decode each file to a waveform resampled to *sample_rate*; unreadable files are logged and skipped."""
        audio_inputs = []
        for audio_path in audio_paths:
            try:
                audio_data, _ = librosa.load(audio_path, sr=sample_rate)
            except Exception as e:
                logger.warning(f"Could not load audio {audio_path}: {e}. Skipping.")
            else:
                audio_inputs.append(audio_data)
        return audio_inputs

    def get_embeddings(self, audio_paths: List[str]) -> List[List[float]]:
        """Return one unit-length embedding (list of floats) per readable audio file.

        Files that cannot be decoded are skipped, so the output may be shorter
        than the input list. Returns [] when no input is usable.
        """
        if not audio_paths:
            return []

        # The processor dictates the sampling rate the underlying model expects.
        sample_rate = self.processor.feature_extractor.sampling_rate
        audio_inputs = self._load_waveforms(audio_paths, sample_rate)
        if not audio_inputs:
            return []

        inputs = self.processor(
            audios=audio_inputs,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True,
        ).to(self.device)

        with torch.no_grad():
            audio_features = self.model.get_audio_features(**inputs)

        # L2-normalise so cosine similarity reduces to a dot product.
        normalised = audio_features / audio_features.norm(p=2, dim=-1, keepdim=True)
        embeddings_list = normalised.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
        return embeddings_list
48
+
49
# Usage example (module-internal check only).
if __name__ == "__main__":
    from config.settings import settings
    import os

    def _make_dummy_audio(target_dir: str) -> str:
        """Write a 1-second silent WAV into *target_dir* and return its path."""
        from pydub import AudioSegment
        os.makedirs(target_dir, exist_ok=True)
        dummy_audio = AudioSegment.silent(duration=1000)  # 1 second of silence
        dummy_audio_path = os.path.join(target_dir, "dummy_audio.wav")
        dummy_audio.export(dummy_audio_path, format="wav")
        return dummy_audio_path

    model = AudioEmbeddingModel()
    # presumably audio segments produced by the audio processor — verify against pipeline
    sample_audio_dir = os.path.join(settings.PROCESSED_DATA_DIR, "audio_segments", "sample_audio")

    # Fall back to a generated dummy clip when no sample audio is available.
    # (Previously the dummy-creation code was duplicated in both branches.)
    if not os.path.exists(sample_audio_dir) or not os.listdir(sample_audio_dir):
        print(f"Creating a dummy audio for testing at {sample_audio_dir}...")
        sample_audio_paths = [_make_dummy_audio(sample_audio_dir)]
    else:
        sample_audio_paths = [os.path.join(sample_audio_dir, f) for f in os.listdir(sample_audio_dir) if f.endswith(('.wav', '.mp3'))]
        if not sample_audio_paths:
            print(f"No audio files found in {sample_audio_dir}. Please ensure sample audio was processed.")
            sample_audio_paths = [_make_dummy_audio(sample_audio_dir)]

    print(f"Using {len(sample_audio_paths)} sample audio clips: {sample_audio_paths[:2]}...")
    embeddings = model.get_embeddings(sample_audio_paths)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        if len(embeddings) > 1:
            # Sanity check: similarity between the first two clips.
            from sklearn.metrics.pairwise import cosine_similarity
            sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
            print(f"Similarity between audio 1 and 2: {sim:.4f}")
core/embeddings/image_embedding_model.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from typing import List
4
+ from PIL import Image
5
+ from transformers import CLIPProcessor, CLIPModel
6
+ from utils.logger import logger
7
+ from config.model_configs import IMAGE_EMBEDDING_MODEL
8
+
9
class ImageEmbeddingModel:
    """CLIP-based encoder that maps image files to L2-normalised embedding vectors."""

    def __init__(self):
        # Use the GPU when available; the processor output is moved there per call.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")

        self.model = CLIPModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
        logger.info("Image Embedding Model loaded successfully.")

    def _load_images(self, image_paths: List[str]) -> list:
        """Open every path as an RGB PIL image; unreadable files are logged and skipped."""
        images = []
        for img_path in image_paths:
            try:
                images.append(Image.open(img_path).convert("RGB"))
            except Exception as e:
                logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
        return images

    def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
        """Return one unit-length CLIP embedding per readable image file.

        Unreadable files are skipped, so the output may be shorter than the
        input list. Returns [] when no input is usable.
        """
        if not image_paths:
            return []

        images = self._load_images(image_paths)
        if not images:
            return []

        inputs = self.processor(images=images, return_tensors="pt").to(self.device)

        with torch.no_grad():
            image_features = self.model.get_image_features(pixel_values=inputs.pixel_values)

        # Unit-normalise so cosine similarity is a plain dot product.
        normalised = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        embeddings_list = normalised.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(images)} images.")
        return embeddings_list
43
+
44
# Usage example (module-internal check only).
if __name__ == "__main__":
    from config.settings import settings
    import os

    def _make_dummy_image(target_dir: str) -> str:
        """Write a small solid-red JPEG into *target_dir* and return its path."""
        os.makedirs(target_dir, exist_ok=True)
        dummy_image_path = os.path.join(target_dir, "dummy_image.jpg")
        dummy_img = Image.new('RGB', (60, 30), color = 'red')
        dummy_img.save(dummy_image_path)
        return dummy_image_path

    model = ImageEmbeddingModel()
    # presumably frames extracted from a sample video — verify against pipeline
    sample_image_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks/sample_video")

    # Fall back to a generated dummy image when no sample image is available.
    # (Previously the dummy-creation code was duplicated in both branches, and
    # the second copy did not ensure the directory exists.)
    if not os.path.exists(sample_image_dir) or not os.listdir(sample_image_dir):
        print(f"Creating a dummy image for testing at {sample_image_dir}...")
        sample_image_paths = [_make_dummy_image(sample_image_dir)]
    else:
        sample_image_paths = [os.path.join(sample_image_dir, f) for f in os.listdir(sample_image_dir) if f.endswith(('.jpg', '.png'))]
        if not sample_image_paths:  # directory exists but holds no images
            print(f"No images found in {sample_image_dir}. Please ensure sample video was processed.")
            sample_image_paths = [_make_dummy_image(sample_image_dir)]

    print(f"Using {len(sample_image_paths)} sample images: {sample_image_paths[:2]}...")
    embeddings = model.get_embeddings(sample_image_paths)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        # When at least two images exist, compare the first pair.
        if len(embeddings) > 1:
            from sklearn.metrics.pairwise import cosine_similarity
            sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
            print(f"Similarity between image 1 and 2: {sim:.4f}")
core/embeddings/text_embedding_model.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from typing import List
4
+ from sentence_transformers import SentenceTransformer
5
+ from utils.logger import logger
6
+ from config.model_configs import TEXT_EMBEDDING_MODEL
7
+
8
class TextEmbeddingModel:
    """SentenceTransformer wrapper that maps strings to dense embedding vectors."""

    def __init__(self):
        # Run on GPU when available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Text Embedding Model '{TEXT_EMBEDDING_MODEL}' to device: {self.device}")

        self.model = SentenceTransformer(TEXT_EMBEDDING_MODEL, device=self.device)
        logger.info("Text Embedding Model loaded successfully.")

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding (list of floats) per input string; [] for empty input."""
        if not texts:
            return []

        encoded = self.model.encode(texts, convert_to_numpy=True)
        embeddings = encoded.tolist()
        logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
        return embeddings
23
+
24
# Usage example (module-internal check only).
if __name__ == "__main__":
    embedder = TextEmbeddingModel()
    sample_texts = [
        "This is a test sentence.",
        "Another sentence for embedding.",
        "How about some natural language processing?",
        "Xe hơi màu đỏ đang chạy trên đường phố."  # also try Vietnamese
    ]
    embeddings = embedder.get_embeddings(sample_texts)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        # Cosine similarity of the first two embeddings as a quick sanity check.
        from sklearn.metrics.pairwise import cosine_similarity
        sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
        print(f"Similarity between text 1 and 2: {sim:.4f}")
core/retrieval/retriever.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/retrieval/retriever.py
2
+ import os
3
+
4
+ from typing import List, Tuple, Dict, Any, Union
5
+ from utils.logger import logger
6
+ from config.settings import settings
7
+ from qdrant_client import QdrantClient
8
+
9
+ from core.embeddings.text_embedding_model import TextEmbeddingModel
10
+ from core.embeddings.image_embedding_model import ImageEmbeddingModel
11
+ from core.embeddings.audio_embedding_model import AudioEmbeddingModel
12
+
13
+ from core.retrieval.vector_db_manager import VectorDBManager
14
+
15
class Retriever:
    """Multimodal retriever: embeds a query with the modality-appropriate model
    and searches the corresponding Qdrant collection.
    """

    def __init__(self, client: QdrantClient):
        """Build the three embedders and attach one VectorDBManager per modality.

        Args:
            client: Shared QdrantClient instance (all collection managers reuse it).
        """
        logger.info("Initializing the Retriever...")

        # Initialize embedding models
        self.text_embedder = TextEmbeddingModel()
        self.image_embedder = ImageEmbeddingModel()
        self.audio_embedder = AudioEmbeddingModel()
        logger.info("Embedding models initialized.")

        qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
        self.client = client
        logger.info(f"Single Qdrant client initialized, connected to: {qdrant_db_path}")

        # Initialize vector database managers (one collection per modality).
        # Text dimension is read from the model; the 512 for image/audio is the
        # projection size assumed at ingestion time — must match ingest_data.py.
        text_dim = self.text_embedder.model.get_sentence_embedding_dimension()
        self.text_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_dim, client=self.client)

        image_dim = 512
        self.image_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_dim, client=self.client)

        audio_dim = 512
        self.audio_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=audio_dim, client=self.client)

        logger.info("VectorDB Managers connected to Qdrant collections.")
        logger.info(f"Text collection ('{self.text_db_manager.collection_name}') contains {self.text_db_manager.get_total_vectors()} vectors.")
        logger.info(f"Image collection ('{self.image_db_manager.collection_name}') contains {self.image_db_manager.get_total_vectors()} vectors.")
        logger.info(f"Audio collection ('{self.audio_db_manager.collection_name}') contains {self.audio_db_manager.get_total_vectors()} vectors.")

    def retrieve(self, query: Union[str, bytes], query_type: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Embed *query* and return the *top_k* most similar stored chunks.

        Args:
            query: A text string, or a path to an image/audio file on disk.
            query_type: One of "text", "image", "audio".
            top_k: Number of nearest neighbours to return.

        Returns:
            List of dicts with "score", "metadata" and "content" keys; empty
            on error or unsupported query type.
        """
        logger.info(f"Received retrieval request. Query type: '{query_type}', Top K: {top_k}")

        embedding = None
        db_manager_to_use = None

        # create embeddings
        try:
            if query_type == "text":
                if not isinstance(query, str):
                    raise TypeError("Text query must be a string.")
                # FIX: get_embeddings expects a list of strings — wrap the query and
                # unwrap the single result, consistent with the image/audio branches.
                embedding = self.text_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.text_db_manager
            elif query_type == "image":
                if not isinstance(query, str) or not os.path.exists(query):
                    raise TypeError("Image query must be a valid file path.")
                embedding = self.image_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.image_db_manager
            elif query_type == "audio":
                if not isinstance(query, str) or not os.path.exists(query):
                    raise TypeError("Audio query must be a valid file path.")
                embedding = self.audio_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.audio_db_manager
            else:
                logger.error(f"Unsupported query type: {query_type}")
                return []
        except Exception as e:
            logger.error(f"Error generating embedding for query: {e}")
            return []

        if embedding is None:
            logger.warning("Could not generate embedding for the query.")
            return []

        # searching vectors
        try:
            search_results = db_manager_to_use.search_vectors(embedding, k=top_k)
        except Exception as e:
            logger.error(f"Error searching in vector database: {e}")
            return []

        # Flatten (score, payload) pairs into the public result shape.
        formatted_results = []
        for score, payload in search_results:
            formatted_results.append({
                "score": score,
                "metadata": payload['metadata'],
                "content": payload['content']
            })

        logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
        return formatted_results

    def is_database_empty(self) -> bool:
        """Return True when all three modality collections hold zero vectors."""
        total_vectors = self.text_db_manager.get_total_vectors() \
            + self.image_db_manager.get_total_vectors() \
            + self.audio_db_manager.get_total_vectors()

        return total_vectors == 0
110
+
111
if __name__ == "__main__":
    from config.settings import settings

    logger.info("--- Running Retriever Standalone Test (Qdrant version) ---")

    # Check whether Qdrant already holds ingested data.
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    if not os.path.exists(qdrant_db_path):
        print("\n\nERROR: Qdrant database not found. Please run 'python scripts/ingest_data.py' first to create the database.\n\n")
    else:
        # FIX: Retriever.__init__ requires a QdrantClient — previously this called
        # Retriever() with no arguments, which raised a TypeError at runtime.
        client = QdrantClient(path=qdrant_db_path)
        retriever = Retriever(client)

        # --- 1. Text query ---
        print("\n--- Testing Text Retrieval ---")
        text_query = "What is artificial intelligence?"
        text_results = retriever.retrieve(text_query, query_type="text", top_k=3)
        print(f"Query: '{text_query}'")
        for i, result in enumerate(text_results):
            print(f"  Result {i+1}:")
            print(f"    Score: {result['score']:.4f}")
            print(f"    Type: {result['metadata']['type']}")
            print(f"    Content Preview: {str(result.get('content'))[:200] if result.get('content') else 'N/A'}...")
            print(f"    Source: {result['metadata']['source_id']}")

        # --- 2. Image query ---
        print("\n--- Testing Image Retrieval ---")
        # Reuse one of the processed chunk images as the query image.
        image_to_query = None
        image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks")
        if os.path.exists(image_chunks_dir):
            for root, _, files in os.walk(image_chunks_dir):
                if files:
                    image_to_query = os.path.join(root, files[0])
                    break

        if image_to_query and os.path.exists(image_to_query):
            print(f"Using image as query: {image_to_query}")
            image_results = retriever.retrieve(image_to_query, query_type="image", top_k=3)
            for i, result in enumerate(image_results):
                print(f"  Result {i+1}:")
                print(f"    Score: {result['score']:.4f}")
                print(f"    Type: {result['metadata']['type']}")
                print(f"    Content (Paths): {result['content']}")
                print(f"    Source: {result['metadata']['source_id']}")
        else:
            print("Could not find a sample image to test image retrieval.")
core/retrieval/vector_db_manager.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from typing import List, Tuple, Dict, Any
4
+ from uuid import uuid4
5
+
6
+ from qdrant_client import QdrantClient, models
7
+ from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
8
+
9
+ from utils.logger import logger
10
+ from config.settings import settings
11
+
12
class VectorDBManager:
    """Thin wrapper around a single Qdrant collection: create, upsert, search, count."""

    def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
        """Attach to (or lazily create) one collection.

        Args:
            collection_name: Name of the Qdrant collection to manage.
            embedding_dim: Vector size used when the collection must be created.
            client: Optional shared QdrantClient; a local file-backed client is
                created when omitted.
        """
        logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")

        if client:
            self.client = client
            logger.info("Using shared Qdrant client instance.")
        else:
            logger.warning("No shared Qdrant client provided. Creating a new local instance.")
            qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
            self.client = QdrantClient(path=qdrant_db_path)

        self.collection_name = collection_name
        self.embedding_dim = embedding_dim

        self.create_collection_if_not_exists()

    def create_collection_if_not_exists(self):
        """Create the collection with a cosine-distance index if it is missing.

        Raises:
            Exception: Re-raised after logging when the Qdrant call fails.
        """
        try:
            collections = self.client.get_collections().collections
            collection_names = [collection.name for collection in collections]

            if self.collection_name not in collection_names:
                logger.info(f"Collection '{self.collection_name}' not found. Creating a new one...")

                # FIX: use create_collection — recreate_collection is deprecated and
                # destructive (it drops an existing collection), which is never the
                # intent inside this not-exists branch.
                self.client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(
                        size=self.embedding_dim,
                        distance=Distance.COSINE
                    )
                )
                logger.success(f"Collection '{self.collection_name}' created successfully.")
            else:
                logger.info(f"Collection '{self.collection_name}' already exists.")

        except Exception as e:
            logger.error(f"Error checking or creating collection '{self.collection_name}': {e}")
            raise

    def add_vectors(self, embeddings: List[List[float]], metadatas: List[Dict[str, Any]]):
        """Upsert embeddings with their payloads; each point gets a fresh UUID id.

        Args:
            embeddings: One vector per point.
            metadatas: One payload dict per point (same length as *embeddings*).

        Raises:
            ValueError: When the two lists differ in length.
        """
        if not embeddings:
            logger.warning("No embeddings to add. Skipping.")
            return

        if len(embeddings) != len(metadatas):
            logger.error("Number of embeddings and metadatas must match.")
            raise ValueError("Embeddings and metadatas count mismatch.")

        # Random UUIDs as point ids: each upsert inserts new points rather than
        # overwriting existing ones.
        points_to_add = [
            PointStruct(id=str(uuid4()), vector=embedding, payload=metadata)
            for embedding, metadata in zip(embeddings, metadatas)
        ]

        try:
            # wait=True blocks until the write is applied so the count below is accurate.
            operation_info = self.client.upsert(
                collection_name=self.collection_name,
                wait=True,
                points=points_to_add
            )
            if operation_info.status == UpdateStatus.COMPLETED:
                logger.debug(f"Successfully upserted {len(points_to_add)} points to collection '{self.collection_name}'.")
            else:
                logger.warning(f"Upsert operation finished with status: {operation_info.status}")
        except Exception as e:
            logger.error(f"Error upserting points to collection '{self.collection_name}': {e}")

    def search_vectors(self, query_embedding: List[float], k: int = 5, filter_payload: Dict = None) -> List[Tuple[float, Dict[str, Any]]]:
        """Return the top-*k* (score, payload) pairs for *query_embedding*.

        Args:
            query_embedding: Query vector of the collection's dimension.
            k: Maximum number of results.
            filter_payload: Optional Qdrant filter applied before scoring.

        Returns:
            List of (similarity_score, payload) tuples; empty on error.
        """
        try:
            search_results = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding,
                query_filter=filter_payload,
                limit=k,
                with_payload=True,   # include payload in return
                with_vectors=False   # exclude vectors in return
            )

            formatted_results = [(scored_point.score, scored_point.payload) for scored_point in search_results]

            logger.debug(f"Searched for top {k} neighbors. Found {len(formatted_results)} results.")
            return formatted_results
        except Exception as e:
            logger.error(f"Error searching in collection '{self.collection_name}': {e}")
            return []

    def get_total_vectors(self) -> int:
        """Return the exact number of points stored in the collection (0 on error)."""
        try:
            count_result = self.client.count(
                collection_name=self.collection_name,
                exact=True  # exact count, not an estimate
            )
            return count_result.count
        except Exception as e:
            logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
            return 0
119
+
120
# Usage example (module-internal check only).
if __name__ == "__main__":
    import numpy as np

    # Parameters for the throwaway test collection.
    TEST_COLLECTION_NAME = "my_test_collection"
    DUMMY_DIM = 128

    # --- Collection creation ---
    print("\n--- Testing Collection Creation ---")
    db_manager = VectorDBManager(collection_name=TEST_COLLECTION_NAME, embedding_dim=DUMMY_DIM)
    print(f"Total vectors initially: {db_manager.get_total_vectors()}")

    # --- Adding vectors with payloads ---
    print("\n--- Testing Add Vectors ---")
    dummy_embeddings = np.random.rand(10, DUMMY_DIM).tolist()
    dummy_metadatas = [
        {"chunk_id": f"dummy_chunk_{i}", "type": "text" if i < 5 else "image", "source_file": "test.txt"}
        for i in range(10)
    ]
    db_manager.add_vectors(dummy_embeddings, dummy_metadatas)
    print(f"Total vectors after adding: {db_manager.get_total_vectors()}")

    # --- Search ---
    print("\n--- Testing Search ---")
    dummy_query = np.random.rand(DUMMY_DIM).tolist()
    # FIX: search_vectors takes the neighbour count as `k`, not `top_k` —
    # the previous keyword raised a TypeError at runtime.
    results = db_manager.search_vectors(dummy_query, k=3)
    print(f"Top 3 results (no filter):")
    for score, payload in results:
        print(f"  Score: {score:.4f}, Payload: {payload}")

    # --- Search with pre-filtering ---
    print("\n--- Testing Search with Filter ---")
    filter_condition = models.Filter(
        must=[
            models.FieldCondition(
                key="type",  # filter on the payload's 'type' field
                match=models.MatchValue(value="image"),  # value must be 'image'
            )
        ]
    )
    filtered_results = db_manager.search_vectors(dummy_query, k=3, filter_payload=filter_condition)
    print(f"Top 3 results (filtered for type='image'):")
    for score, payload in filtered_results:
        print(f"  Score: {score:.4f}, Payload: {payload}")

    # --- Clean up the test collection ---
    print("\n--- Cleaning up test collection ---")
    db_manager.client.delete_collection(collection_name=TEST_COLLECTION_NAME)
    print(f"Collection '{TEST_COLLECTION_NAME}' deleted.")
requirements.txt ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.5.0
3
+ annotated-types==0.7.0
4
+ anyio==4.9.0
5
+ attrs==25.3.0
6
+ audioread==3.0.1
7
+ certifi==2025.7.14
8
+ cffi==1.17.1
9
+ charset-normalizer==3.4.2
10
+ click==8.2.1
11
+ colorama==0.4.6
12
+ contourpy==1.3.3
13
+ cycler==0.12.1
14
+ decorator==4.4.2
15
+ faiss-cpu==1.11.0.post1
16
+ fastapi==0.116.1
17
+ ffmpy==0.6.1
18
+ filelock==3.13.1
19
+ fonttools==4.59.0
20
+ fsspec==2024.6.1
21
+ gradio==5.6.0
22
+ gradio_client==1.4.3
23
+ grpcio==1.74.0
24
+ grpcio-tools==1.74.0
25
+ h11==0.16.0
26
+ h2==4.2.0
27
+ hpack==4.1.0
28
+ httpcore==1.0.9
29
+ httpx==0.28.1
30
+ huggingface-hub==0.34.3
31
+ hyperframe==6.1.0
32
+ idna==3.10
33
+ imageio==2.37.0
34
+ imageio-ffmpeg==0.6.0
35
+ importlib_resources==6.5.2
36
+ inquirerpy==0.3.4
37
+ Jinja2==3.1.4
38
+ joblib==1.5.1
39
+ jsonpatch==1.33
40
+ jsonpointer==3.0.0
41
+ jsonschema==4.25.0
42
+ jsonschema-specifications==2025.4.1
43
+ kiwisolver==1.4.8
44
+ langchain-core==0.2.43
45
+ langchain-text-splitters==0.2.0
46
+ langsmith==0.1.147
47
+ lazy_loader==0.4
48
+ librosa==0.10.2
49
+ llvmlite==0.44.0
50
+ loguru==0.7.2
51
+ markdown-it-py==3.0.0
52
+ MarkupSafe==2.1.5
53
+ matplotlib==3.10.5
54
+ mdurl==0.1.2
55
+ moviepy==1.0.3
56
+ mpmath==1.3.0
57
+ msgpack==1.1.1
58
+ narwhals==2.0.1
59
+ networkx==3.3
60
+ numba==0.61.2
61
+ numpy==1.26.4
62
+ opencv-python==4.12.0.88
63
+ orjson==3.11.1
64
+ packaging==24.2
65
+ pandas==2.3.1
66
+ pfzy==0.3.4
67
+ pillow==10.4.0
68
+ platformdirs==4.3.8
69
+ pooch==1.8.2
70
+ portalocker==2.10.1
71
+ proglog==0.1.12
72
+ prompt_toolkit==3.0.51
73
+ protobuf==6.31.1
74
+ pycparser==2.22
75
+ pydantic==2.10.6
76
+ pydantic-settings==2.3.4
77
+ pydantic_core==2.27.2
78
+ pydub==0.25.1
79
+ Pygments==2.19.2
80
+ pyparsing==3.2.3
81
+ python-dateutil==2.9.0.post0
82
+ python-dotenv==1.1.1
83
+ python-multipart==0.0.12
84
+ pytz==2025.2
85
+ pywin32==311
86
+ PyYAML==6.0.2
87
+ qdrant-client==1.9.0
88
+ referencing==0.36.2
89
+ regex==2024.11.6
90
+ requests==2.32.4
91
+ requests-toolbelt==1.0.0
92
+ rich==14.1.0
93
+ rpds-py==0.26.0
94
+ ruff==0.12.7
95
+ safehttpx==0.1.6
96
+ safetensors==0.5.3
97
+ scikit-learn==1.7.1
98
+ scipy==1.16.1
99
+ semantic-version==2.10.0
100
+ sentence-transformers==5.0.0
101
+ shellingham==1.5.4
102
+ six==1.17.0
103
+ sniffio==1.3.1
104
+ soundfile==0.13.1
105
+ soxr==0.5.0.post1
106
+ starlette==0.47.2
107
+ sympy==1.13.1
108
+ tenacity==8.5.0
109
+ threadpoolctl==3.6.0
110
+ tokenizers==0.19.1
111
+ tomlkit==0.12.0
112
+ torch==2.6.0+cu124
113
+ torchaudio==2.6.0+cu124
114
+ torchvision==0.21.0+cu124
115
+ tqdm==4.67.1
116
+ transformers==4.41.2
117
+ typer==0.16.0
118
+ typing-inspection==0.4.1
119
+ typing_extensions==4.12.2
120
+ tzdata==2025.2
121
+ urllib3==2.5.0
122
+ uvicorn==0.35.0
123
+ wcwidth==0.2.13
124
+ websockets==11.0.3
125
+ win32_setctime==1.2.0
scripts/ingest_data.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scripts/ingest_data.py
2
+ import os
3
+ import json
4
+ from tqdm import tqdm
5
+ from pathlib import Path
6
+ from typing import Dict
7
+ import shutil
8
+
9
+ from config.settings import settings
10
+ from utils.logger import logger
11
+ from qdrant_client import QdrantClient
12
+
13
+ # Import các Processor (không thay đổi)
14
+ from core.data_processing.text_processor import TextProcessor
15
+ from core.data_processing.audio_processor import AudioProcessor
16
+ # from core.data_processing.video_processor import VideoProcessor
17
+ from core.data_processing.image_processor import ImageProcessor
18
+
19
+ # Import các Embedding Model (không thay đổi)
20
+ from core.embeddings.text_embedding_model import TextEmbeddingModel
21
+ from core.embeddings.image_embedding_model import ImageEmbeddingModel
22
+ from core.embeddings.audio_embedding_model import AudioEmbeddingModel
23
+
24
+ # Import VectorDBManager phiên bản Qdrant MỚI
25
+ from core.retrieval.vector_db_manager import VectorDBManager
26
+
27
+ def walk_through_files(extentions: Dict, raw_dir: str, all_raw_chunks_from_processors, processor):
28
+ all_files = list(raw_dir.rglob("*"))
29
+ for filepath in tqdm(all_files, desc="Processing " + raw_dir.name):
30
+ if filepath.suffix in extentions and filepath.is_file():
31
+ all_raw_chunks_from_processors.extend(
32
+ processor.process(str(filepath))
33
+ )
34
+
35
+ def ingest_data_pipeline():
36
+ logger.info("Starting comprehensive data ingestion pipeline (Chunking + Embedding + Qdrant Indexing)...")
37
+
38
+ # --- 1. Khởi tạo các Processor --- (Không thay đổi)
39
+ text_processor = TextProcessor(chunk_size=500, chunk_overlap=50)
40
+ audio_processor = AudioProcessor(min_silence_len=1000, silence_thresh_db=-40, target_sr=16000)
41
+ image_processor = ImageProcessor()
42
+ # video_processor = VideoProcessor(chunk_duration_sec=15, frames_per_segment=5)
43
+
44
+ # --- Dọn dẹp các thư mục chunk và Qdrant data cũ ---
45
+ dirs_to_clean_and_create = [
46
+ settings.CHUNKS_DIR,
47
+ settings.METADATA_DIR
48
+ ]
49
+ # Thư mục dữ liệu của Qdrant
50
+ qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
51
+ dirs_to_clean_and_create.append(qdrant_db_path)
52
+
53
+ for dir_path in dirs_to_clean_and_create:
54
+ if os.path.exists(dir_path):
55
+ shutil.rmtree(dir_path)
56
+ logger.info(f"Cleaned up old directory: {dir_path}")
57
+ # Tạo lại các thư mục cho chunking, trừ thư mục qdrant (client sẽ tự tạo)
58
+ if dir_path != qdrant_db_path:
59
+ os.makedirs(dir_path, exist_ok=True)
60
+
61
+ logger.info("Output directories and previous Qdrant data are ready for fresh ingestion.")
62
+
63
+ qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
64
+ client = QdrantClient(path=qdrant_db_path)
65
+ logger.info(f"Single Qdrant client initialized for ingestion, connected to: {qdrant_db_path}")
66
+
67
+ all_raw_chunks_from_processors = [] # Chứa tất cả các chunk (bao gồm content và metadata)
68
+
69
+ # --- 2. Chạy Data Processing (Chunking) --- (Không thay đổi)
70
+ logger.info("--- Phase 1: Processing Raw Data into Chunks ---")
71
+
72
+ # Xử lý Văn bản
73
+ text_extentions = {".txt"}
74
+ text_raw_dir = Path(settings.RAW_DATA_DIR) / "texts"
75
+ walk_through_files(text_extentions, text_raw_dir, all_raw_chunks_from_processors, text_processor)
76
+
77
+ # Xử lý Âm thanh
78
+ audio_extentions = {".wav", ".mp3"}
79
+ audio_raw_dir = Path(settings.RAW_DATA_DIR) / "audios"
80
+ walk_through_files(audio_extentions, audio_raw_dir, all_raw_chunks_from_processors, audio_processor)
81
+
82
+ # process images
83
+ image_extentions = {".jpg", ".png"}
84
+ image_raw_dir = Path(settings.RAW_DATA_DIR) / "images"
85
+ walk_through_files(image_extentions, image_raw_dir, all_raw_chunks_from_processors, image_processor)
86
+
87
+ # Xử lý Video
88
+ # video_raw_dir = os.path.join(settings.RAW_DATA_DIR, "videos")
89
+ # for filename in tqdm(os.listdir(video_raw_dir), desc="Processing Video"):
90
+ # if filename.endswith((".mp4", ".avi", ".mov")):
91
+ # all_raw_chunks_from_processors.extend(video_processor.process_video(os.path.join(video_raw_dir, filename)))
92
+
93
+ logger.info(f"Total raw chunks processed from all sources: {len(all_raw_chunks_from_processors)}")
94
+
95
+ # --- 3. Tạo Embedding và Thêm vào Qdrant ---
96
+ logger.info("--- Phase 2: Generating Embeddings and Building Qdrant Collections ---")
97
+
98
+ # Khởi tạo các Embedding Model
99
+ text_embedder = TextEmbeddingModel()
100
+ image_embedder = ImageEmbeddingModel()
101
+ audio_embedder = AudioEmbeddingModel()
102
+
103
+ # --- Khởi tạo các VectorDBManager cho Qdrant ---
104
+ # Lấy kích thước embedding từ model để đảm bảo chính xác
105
+ text_embedding_dim = text_embedder.model.get_sentence_embedding_dimension()
106
+ text_vector_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_embedding_dim, client=client)
107
+
108
+ # Kích thước embedding cho image/audio (giả định là 512)
109
+ image_embedding_dim = 512
110
+ image_vector_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_embedding_dim, client=client)
111
+
112
+ # video_frame_embedding_dim = 512
113
+ # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)
114
+
115
+ audio_embedding_dim = 512
116
+ audio_vector_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=image_embedding_dim, client=client)
117
+
118
+ logger.info(f"Initialized Text Qdrant Collection Manager with {text_embedding_dim}D.")
119
+ logger.info(f"Initialized Image Qdrant Collection Manager with {image_embedding_dim}D.")
120
+ logger.info(f"Initialized Audio Qdrant Collection Manager with {audio_embedding_dim}D.")
121
+
122
+ # Tạo các batch để thêm vào Qdrant hiệu quả hơn
123
+ text_embeddings_batch = []
124
+ text_metadatas_batch = []
125
+
126
+ image_embeddings_batch = []
127
+ image_metadatas_batch = []
128
+
129
+ # video_frame_embeddings_batch = []
130
+ # video_frame_metadatas_batch = []
131
+
132
+ audio_embeddings_batch = []
133
+ audio_metadatas_batch = []
134
+
135
+ BATCH_SIZE = 32 # Thêm 32 điểm một lần
136
+
137
+ for chunk_data in tqdm(all_raw_chunks_from_processors, desc="Generating Embeddings & Populating Qdrant"):
138
+ chunk_type = chunk_data['metadata']['type']
139
+ content = chunk_data['content']
140
+
141
+ try:
142
+ if chunk_type == "text":
143
+ embedding = text_embedder.get_embeddings([content])[0]
144
+ text_embeddings_batch.append(embedding)
145
+ text_metadatas_batch.append(chunk_data)
146
+
147
+ elif chunk_type == "audio":
148
+ embedding = audio_embedder.get_embeddings([content])[0]
149
+ audio_embeddings_batch.append(embedding)
150
+ audio_metadatas_batch.append(chunk_data)
151
+
152
+ elif chunk_type == "image":
153
+ embedding = image_embedder.get_embeddings([content])[0]
154
+ image_embeddings_batch.append(embedding)
155
+ image_metadatas_batch.append(chunk_data)
156
+
157
+ # elif chunk_type == "video_frame":
158
+ # if content and isinstance(content, list) and len(content) > 0:
159
+ # embedding = image_embedder.get_embeddings([content[0]])[0] # Chỉ nhúng ảnh đầu tiên
160
+ # video_frame_embeddings_batch.append(embedding)
161
+ # video_frame_metadatas_batch.append(chunk_data['metadata'])
162
+
163
+ # Xử lý batch
164
+ if len(text_embeddings_batch) >= BATCH_SIZE:
165
+ text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
166
+ text_embeddings_batch, text_metadatas_batch = [], [] # Reset batch
167
+
168
+ if len(audio_embeddings_batch) >= BATCH_SIZE:
169
+ audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
170
+ audio_embeddings_batch, audio_metadatas_batch = [], [] # Reset batch
171
+
172
+ if len(image_embeddings_batch) >= BATCH_SIZE:
173
+ image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
174
+ image_embeddings_batch, image_metadatas_batch = [], [] # Reset batch
175
+
176
+ # if len(video_frame_embeddings_batch) >= BATCH_SIZE:
177
+ # video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
178
+ # video_frame_embeddings_batch, video_frame_metadatas_batch = [], [] # Reset batch
179
+
180
+ except Exception as e:
181
+ logger.error(f"Error processing chunk {chunk_data['metadata']['chunk_id']}: {e}")
182
+
183
+ # Thêm các embedding còn lại trong batch cuối cùng
184
+ if text_embeddings_batch:
185
+ text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
186
+ if audio_embeddings_batch:
187
+ audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
188
+ if image_embeddings_batch:
189
+ image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
190
+ # if video_frame_embeddings_batch:
191
+ # video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
192
+
193
+ logger.success("Finished populating Qdrant collections.")
194
+ logger.info(f"Total vectors in 'text_collection': {text_vector_db_manager.get_total_vectors()}")
195
+ logger.info(f"Total vectors in 'audio_collection': {audio_vector_db_manager.get_total_vectors()}")
196
+ logger.info(f"Total vectors in 'image_collection': {image_vector_db_manager.get_total_vectors()}")
197
+ # logger.info(f"Total vectors in 'video_frame_collection': {video_frame_vector_db_manager.get_total_vectors()}")
198
+
199
+ logger.info("Data ingestion pipeline completed successfully!")
200
+
201
+
202
# Script entry point: run the full data-ingestion pipeline when executed directly.
if __name__ == "__main__":
    ingest_data_pipeline()
scripts/ingestion.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/ingestion/ingestion_service.py
2
+ import os
3
+ import gradio as gr
4
+ from typing import List, Optional, Callable
5
+ from tqdm import tqdm
6
+
7
+ from utils.logger import logger
8
+ from config.settings import settings
9
+ from qdrant_client import QdrantClient
10
+
11
+ # Import các Processor (không thay đổi)
12
+ from core.data_processing.text_processor import TextProcessor
13
+ from core.data_processing.audio_processor import AudioProcessor
14
+ # from core.data_processing.video_processor import VideoProcessor
15
+ from core.data_processing.image_processor import ImageProcessor
16
+
17
+ # Import các Embedding Model (không thay đổi)
18
+ from core.embeddings.text_embedding_model import TextEmbeddingModel
19
+ from core.embeddings.image_embedding_model import ImageEmbeddingModel
20
+ from core.embeddings.audio_embedding_model import AudioEmbeddingModel
21
+
22
+ # Import VectorDBManager phiên bản Qdrant MỚI
23
+ from core.retrieval.vector_db_manager import VectorDBManager
24
+
25
class IngestionService:
    """Stateless ingestion service.

    Turns raw files into modality-specific chunks, embeds each chunk, and
    upserts the vectors into per-modality Qdrant collections. This version
    keeps no per-file state: every call re-processes whatever paths it gets.
    """

    def __init__(self, client: QdrantClient):
        """Initialize the service with a shared QdrantClient.

        Args:
            client: An already-connected Qdrant client shared across services.
        """
        logger.info("Initializing IngestionService (Stateless)...")

        self.client = client

        # Processors turn raw files into chunk dicts shaped like
        # {"content": ..., "metadata": {"type": ..., "chunk_id": ...}}.
        self.text_processor = TextProcessor()
        self.image_processor = ImageProcessor()
        self.audio_processor = AudioProcessor()
        # self.video_processor = VideoProcessor()

        # One embedding model per modality.
        self.text_embedder = TextEmbeddingModel()
        self.image_embedder = ImageEmbeddingModel()
        self.audio_embedder = AudioEmbeddingModel()

        # Text embedding size is read from the model itself to stay accurate.
        text_dim = self.text_embedder.model.get_sentence_embedding_dimension()
        self.text_db_manager = VectorDBManager(
            client=self.client,
            collection_name="text_collection",
            embedding_dim=text_dim,
        )

        # Image/audio embedding sizes are assumed to be 512 — TODO confirm
        # against the actual embedding models in use.
        image_embedding_dim = 512
        self.image_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="image_collection",
            embedding_dim=image_embedding_dim,
        )

        audio_embedding_dim = 512
        self.audio_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="audio_collection",
            embedding_dim=audio_embedding_dim,
        )

        # video_frame_embedding_dim = 512
        # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)

        logger.info("IngestionService initialized successfully.")

    def ingest_files(self, file_paths: List[str]):
        """Process a list of files, embed them, and add them to Qdrant.

        Assumes the files are already in the correct 'raw' sub-directories.
        Convenience wrapper around :meth:`ingest_files_with_progress`
        without progress reporting.
        """
        return self.ingest_files_with_progress(file_paths, None)

    def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
        """Process a list of files with optional progress tracking.

        Args:
            file_paths: Paths of the files to ingest.
            progress_callback: Optional ``callback(fraction, desc=...)`` —
                e.g. a Gradio progress handle. Fractions reported here run
                from 0.4 to 1.0; earlier stages (upload, staging) are assumed
                to be reported by the caller.
        """
        logger.info(f"Starting ingestion for {len(file_paths)} files...")

        if progress_callback:
            progress_callback(0.4, desc="Starting file processing...")

        # Phase 1: raw files -> chunk dicts.
        all_chunks_to_process = self._process_files_to_chunks(file_paths, progress_callback)

        if not all_chunks_to_process:
            logger.warning("No processable chunks were generated from the provided files.")
            return

        logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")

        if progress_callback:
            progress_callback(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")

        # Phase 2: chunks -> embeddings -> Qdrant.
        self._embed_and_store_chunks(all_chunks_to_process, progress_callback)

        if progress_callback:
            progress_callback(1.0, desc=f"✅ Successfully ingested {len(file_paths)} files with {len(all_chunks_to_process)} chunks!")

        logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")

    def _process_files_to_chunks(self, file_paths: List[str], progress_callback: Optional[Callable]) -> list:
        """Phase 1 (40% -> 70%): dispatch each file to its processor by extension.

        Unsupported extensions and per-file processing errors are logged and
        skipped so one bad file cannot abort the whole batch.
        """
        all_chunks_to_process = []
        for i, file_path in enumerate(file_paths):
            base_progress = 0.4 + (i / len(file_paths)) * 0.3  # 40% -> 70%
            file_name = os.path.basename(file_path)

            if progress_callback:
                progress_callback(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")

            # Pick the processor from the file extension.
            file_ext = os.path.splitext(file_path)[1].lower()
            try:
                if progress_callback:
                    progress_callback(base_progress + 0.01, desc=f"Reading {file_name}...")

                if file_ext in ['.txt']:
                    chunks = self.text_processor.process(file_path)
                elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
                    chunks = self.image_processor.process(file_path)
                elif file_ext in ['.wav', '.mp3']:
                    chunks = self.audio_processor.process(file_path)
                # elif file_ext in ['.mp4', '.avi', '.mov']:
                #     chunks = self.video_processor.process_video(file_path)
                else:
                    logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
                    continue

                if progress_callback:
                    progress_callback(base_progress + 0.02, desc=f"Generated {len(chunks)} chunks from {file_name}")

                all_chunks_to_process.extend(chunks)

            except Exception as e:
                logger.error(f"Error processing file {file_path}: {e}")
                continue

        return all_chunks_to_process

    def _embed_and_store_chunks(self, all_chunks_to_process: list, progress_callback: Optional[Callable]):
        """Phase 2 (70% -> 99%): embed chunks per modality and upsert in batches.

        Vectors are buffered per modality and flushed every BATCH_SIZE items;
        any remainder is flushed at the end. Per-chunk failures are logged
        and skipped.
        """
        text_embeddings_batch, text_metadatas_batch = [], []
        audio_embeddings_batch, audio_metadatas_batch = [], []
        image_embeddings_batch, image_metadatas_batch = [], []
        BATCH_SIZE = 32

        for i, chunk_data in enumerate(all_chunks_to_process):
            base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25  # 70% -> 95%

            chunk_type = chunk_data['metadata']['type']
            content = chunk_data['content']
            chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')

            try:
                if progress_callback:
                    progress_callback(base_progress, desc=f"Processing chunk {i+1}/{len(all_chunks_to_process)} ({chunk_type})")

                if chunk_type == "text":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating text embedding for chunk {i+1}")
                    text_embeddings_batch.append(self.text_embedder.get_embeddings([content])[0])
                    text_metadatas_batch.append(chunk_data)
                elif chunk_type == "audio":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating audio embedding for chunk {i+1}")
                    audio_embeddings_batch.append(self.audio_embedder.get_embeddings([content])[0])
                    audio_metadatas_batch.append(chunk_data)
                elif chunk_type == "image":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating image embedding for chunk {i+1}")
                    image_embeddings_batch.append(self.image_embedder.get_embeddings([content])[0])
                    image_metadatas_batch.append(chunk_data)

                # Flush any modality batch that reached the threshold.
                if len(text_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
                    self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
                    text_embeddings_batch, text_metadatas_batch = [], []

                if len(audio_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
                    self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
                    audio_embeddings_batch, audio_metadatas_batch = [], []

                if len(image_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
                    self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
                    image_embeddings_batch, image_metadatas_batch = [], []

            except Exception as e:
                logger.error(f"Error ingesting chunk {chunk_id}: {e}")

        if progress_callback:
            progress_callback(0.95, desc="Saving final batches...")

        # Flush whatever is left in each batch (95% -> 99%).
        final_batches = [
            (batch_type, embeddings, metadatas, manager)
            for batch_type, embeddings, metadatas, manager in (
                ("text", text_embeddings_batch, text_metadatas_batch, self.text_db_manager),
                ("audio", audio_embeddings_batch, audio_metadatas_batch, self.audio_vector_db_manager),
                ("image", image_embeddings_batch, image_metadatas_batch, self.image_vector_db_manager),
            )
            if embeddings
        ]
        for i, (batch_type, embeddings, metadatas, manager) in enumerate(final_batches):
            current_progress = 0.95 + (i / len(final_batches)) * 0.04  # 95% -> 99%
            if progress_callback:
                progress_callback(current_progress, desc=f"Saving final {len(embeddings)} {batch_type} embeddings...")
            manager.add_vectors(embeddings, metadatas)
utils/logger.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from loguru import logger
from config.settings import settings  # Project settings (provides LOG_LEVEL)

# Configure the shared application logger.
logger.remove()  # Drop loguru's default handler so we fully control the sinks.

# File sink: timestamped, rotating, compressed log files.
logger.add(
    "logs/file_{time}.log",    # One log file per process start, named by time
    rotation="10 MB",          # Start a new file once the current one hits 10 MB
    compression="zip",         # Compress rotated files
    level=settings.LOG_LEVEL,  # Minimum level comes from application settings
    colorize=False,            # Fix: colorize is for terminal sinks; never write ANSI codes to files
    format="{time} {level} {message}",
    enqueue=True,              # Queue-based, non-blocking writes (important for threaded/async apps)
)

# Console sink: colorized output on stderr.
logger.add(
    sys.stderr,
    level=settings.LOG_LEVEL,
    colorize=True,
    format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
)

# Re-export so other modules can `from utils.logger import logger`.
__all__ = ["logger"]