3v324v23 commited on
Commit
fcb8b13
·
1 Parent(s): 3966beb

Add application file

Browse files
.gitignore ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
app.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/main.py
2
+ import gradio as gr
3
+ import os
4
+ import sys
5
+ import shutil
6
+ import zipfile
7
+ from typing import List, Dict, Any
8
+ from pathlib import Path
9
+
10
+ # Thêm thư mục gốc của dự án vào Python Path để có thể import các module
11
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
12
+ if project_root not in sys.path:
13
+ sys.path.insert(0, project_root)
14
+
15
+ from utils.logger import logger
16
+ from config.settings import settings
17
+ from qdrant_client import QdrantClient
18
+ from core.retrieval.retriever import Retriever
19
+ from scripts.ingestion import IngestionService
20
+
21
+ # --- 1. Khởi tạo các dịch vụ toàn cục ---
22
+ logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
23
+ try:
24
+ # Tạo MỘT QdrantClient duy nhất để chia sẻ
25
+ qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
26
+ shared_qdrant_client = QdrantClient(path=qdrant_db_path)
27
+ logger.info("Shared Qdrant client initialized.")
28
+
29
+ # Khởi tạo các dịch vụ, chia sẻ client
30
+ ingestion_service = IngestionService(client=shared_qdrant_client)
31
+ retriever_instance = Retriever(client=shared_qdrant_client)
32
+
33
+ logger.info("All services initialized successfully.")
34
+ except Exception as e:
35
+ logger.error(f"Failed to initialize global services: {e}")
36
+ raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
37
+
38
+
39
+ # ---- HÀM XỬ LÝ CHO TAB UPLOAD ----
40
def upload_handler(zip_path: str, progress=gr.Progress()):
    """Extract an uploaded ZIP archive into the raw-data directory and ingest
    every regular file it contains.

    Args:
        zip_path: Filesystem path of the uploaded archive (Gradio `type="filepath"`).
        progress: Gradio progress tracker, injected automatically by Gradio.

    Returns:
        A human-readable status string shown in the "Status" textbox.
    """
    progress(0, desc="🚀 Starting upload process...")

    if not zip_path:
        return "Error: No file uploaded"

    if not zip_path.endswith(".zip"):
        return "Error: Please upload a zip file"

    progress(0.05, desc="📦 Extracting ZIP file...")

    try:
        # NOTE(review): extractall sanitizes absolute paths / drive letters, but
        # archives from untrusted users are still worth auditing for path tricks.
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(settings.RAW_DATA_DIR)
    except zipfile.BadZipFile:
        return "Invalid ZIP file."
    except Exception as e:
        return f"Error during extraction: {str(e)}"

    progress(0.15, desc="🔍 Scanning for files...")

    # Collect every regular file under RAW_DATA_DIR (recursively).
    raw_dir = Path(settings.RAW_DATA_DIR)
    files_to_ingest = [str(p) for p in raw_dir.rglob("*") if p.is_file()]

    progress(0.25, desc="📊 Analyzing files...")

    if not files_to_ingest:
        return "No valid files found in the uploaded items."

    # BUG FIX: the original logged len(settings.RAW_DATA_DIR) — the length of
    # the directory *path string* — as the number of uploaded items. Log the
    # actual file count instead. Also removed a pointless list .copy() and an
    # unreachable duplicate emptiness check.
    logger.info(f"Found a total of {len(files_to_ingest)} files to process.")
    progress(0.35, desc=f"✅ Found {len(files_to_ingest)} files to process...")

    # Hand the whole batch to the ingestion service in one call.
    try:
        progress(0.4, desc="🔄 Starting file ingestion...")
        ingestion_service.ingest_files_with_progress(files_to_ingest)

        success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
        logger.success(success_message)
        return success_message
    except Exception as e:
        error_message = f"An error occurred during the ingestion process: {e}"
        logger.error(error_message)
        return error_message
97
+
98
+ # ---- HÀM XỬ LÝ CHO TAB SEARCH ----
99
def search_handler(text_query: str, image_query_path: str, audio_query_path: str, top_k: int):
    """Run one retrieval query and map the results onto the fixed grid of
    Gradio result components.

    Exactly one query modality is used, with precedence text > image > audio.
    The return value is a flat list whose order MUST match `all_outputs` in
    `create_and_run_app`: first the info textbox, then 5 component updates
    (group, markdown, textbox, image, audio) for each of the 10 result slots.
    """
    def create_empty_updates(max_results=10):
        # Build hidden-state updates for all result slots (5 components each).
        updates = []
        for _ in range(max_results):
            # Exactly 5 components per result: group, markdown, textbox, image, audio.
            updates.extend([
                gr.Group(visible=False),
                gr.Markdown(visible=False),
                gr.Textbox(visible=False),
                gr.Image(visible=False),
                gr.Audio(visible=False)
            ])
        return updates

    # Check the database before processing the query.
    try:
        if retriever_instance.is_database_empty():
            empty_db_message = gr.Textbox(
                value="Database is empty. Please go to the 'Upload Data' tab to add files first.",
                visible=True
            )
            return [empty_db_message] + create_empty_updates()
    except Exception as e:
        error_message = gr.Textbox(
            value=f"Error checking database: {str(e)}",
            visible=True
        )
        return [error_message] + create_empty_updates()

    # Pick the query modality; text wins over image, image over audio.
    query_type, query_content = None, None
    if text_query and text_query.strip():
        query_type, query_content = "text", text_query
    elif image_query_path:
        query_type, query_content = "image", image_query_path
    elif audio_query_path:
        query_type, query_content = "audio", audio_query_path

    max_results = 10  # Must match the number of result components created in the UI.

    if not query_type:
        return [gr.Textbox(value="Error: Please provide a query.", visible=True)] + create_empty_updates()

    try:
        logger.info(f"Handling '{query_type}' query: {query_content}")
        results = retriever_instance.retrieve(query_content, query_type, int(top_k))

        if not results:
            return [gr.Textbox(value="No results found.", visible=True)] + create_empty_updates()

        output_updates = [gr.Textbox(value="", visible=False)]  # Hide the info_box on success.
        for i in range(max_results):
            if i < len(results):
                res = results[i]
                score, metadata, content = res['score'], res['metadata'], res.get('content')
                chunk_type, source_id = metadata.get('type', 'N/A'), metadata.get('source_id', 'N/A')
                info_text = f"### Result {i + 1} (Score: {score:.4f})\n**Type:** `{chunk_type}` | **Source:** `{source_id}`"

                # Default every media slot to hidden; reveal only the matching one.
                text_val, text_visible = "", False
                img_val, img_visible = None, False
                audio_val, audio_visible = None, False

                if chunk_type == 'text':
                    text_val, text_visible = content, True
                elif chunk_type == "image":
                    # `content` is expected to be a filesystem path for media chunks.
                    if content and os.path.exists(content):
                        img_val, img_visible = content, True
                    else:
                        text_val, text_visible = "`Image content not found at path.`", True
                elif chunk_type == 'audio':
                    if content and os.path.exists(content):
                        audio_val, audio_visible = content, True
                    else:
                        text_val, text_visible = "`Audio content not found at path.`", True

                # Exactly 5 components per result.
                output_updates.extend([
                    gr.Group(visible=True),
                    gr.Markdown(value=info_text, visible=True),
                    gr.Textbox(value=text_val, visible=text_visible),
                    gr.Image(value=img_val, visible=img_visible),
                    gr.Audio(value=audio_val, visible=audio_visible)
                ])
            else:
                # Exactly 5 components per result; unused slots stay hidden.
                output_updates.extend([
                    gr.Group(visible=False),
                    gr.Markdown(visible=False),
                    gr.Textbox(visible=False),
                    gr.Image(visible=False),
                    gr.Audio(visible=False)
                ])

        return output_updates

    except Exception as e:
        error_message = f"Error during search: {str(e)}"
        logger.error(error_message)
        return [gr.Textbox(value=error_message, visible=True)] + create_empty_updates()
197
+
198
+ # --- 3. Xây dựng giao diện với Gradio Blocks ---
199
def create_and_run_app():
    """Build the Gradio Blocks UI (Search tab + Upload tab) and return it.

    Despite the name, this function only constructs the `demo` object; the
    caller is responsible for `demo.launch()`. The order of `all_outputs`
    must stay in lock-step with the list `search_handler` returns.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Multimedia RAG Assistant") as demo:
        gr.Markdown("# Multimedia RAG Assistant")

        with gr.Tabs() as tabs:
            # --- TAB 1: SEARCH ---
            with gr.TabItem("Search Database", id=0):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input Query")
                        text_query_input = gr.Textbox(label="Text Query", placeholder="e.g., a dog playing in a park")
                        image_query_input = gr.Image(label="Image Query", type="filepath")
                        audio_query_input = gr.Audio(label="Audio Query", type="filepath")
                        top_k_slider = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Top K Results")
                        search_button = gr.Button("Search", variant="primary")

                    with gr.Column(scale=2):
                        gr.Markdown("### Retrieval Results")
                        info_box = gr.Textbox(label="Info", interactive=False, visible=False)
                        max_results = 10  # Must match max_results inside search_handler.
                        result_components = []

                        # Pre-create a fixed grid of hidden result slots;
                        # search_handler toggles their visibility/values.
                        for i in range(max_results):
                            with gr.Group(visible=False) as result_group:
                                result_info = gr.Markdown()
                                result_text = gr.Textbox(label="Text Content", interactive=False, visible=False)
                                result_image = gr.Image(label="Image Content", interactive=False, visible=False)
                                result_audio = gr.Audio(label="Audio Content", visible=False, type="filepath")
                                # Exactly 5 components per result slot.
                                result_components.extend([result_group, result_info, result_text, result_image, result_audio])

                all_outputs = [info_box] + result_components
                search_button.click(
                    fn=search_handler,
                    inputs=[text_query_input, image_query_input, audio_query_input, top_k_slider],
                    outputs=all_outputs
                )

            # --- TAB 2: UPLOAD ---
            with gr.TabItem("Upload Data", id=1):
                gr.Markdown("### Upload New Data to the Database")
                gr.Markdown("You can upload multiple files of different types at once (text, images, audio), or drop a folder.")
                with gr.Column():
                    upload_file_input = gr.File(
                        label="Upload ZIP file containing your data",
                        file_types=[".zip"],
                        file_count="single",
                        type="filepath"
                    )
                    upload_button = gr.Button("Upload and Ingest", variant="primary")
                    upload_status = gr.Textbox(label="Status", interactive=False, placeholder="Upload status will be shown here...")

                upload_button.click(
                    fn=upload_handler,
                    inputs=[upload_file_input],
                    outputs=[upload_status],
                    show_progress="full"  # Show the progress bar.
                )

        # Event handling: entering one query modality clears the other two
        # inputs on the Search tab, keeping a single active modality.
        def clear_search_inputs(input_type):
            if input_type == 'text': return gr.Image(value=None), gr.Audio(value=None)
            elif input_type == 'image': return gr.Textbox(value=""), gr.Audio(value=None)
            elif input_type == 'audio': return gr.Textbox(value=""), gr.Image(value=None)

        text_query_input.change(lambda: clear_search_inputs('text'), outputs=[image_query_input, audio_query_input], queue=False)
        image_query_input.change(lambda: clear_search_inputs('image'), outputs=[text_query_input, audio_query_input], queue=False)
        audio_query_input.change(lambda: clear_search_inputs('audio'), outputs=[text_query_input, image_query_input], queue=False)

    return demo
269
+
270
+ # --- 4. Chạy ứng dụng ---
271
# Entry point: build the Gradio UI and serve it with default launch settings.
if __name__ == "__main__":
    logger.info("Launching Gradio interface...")
    demo = create_and_run_app()
    demo.launch()
config/database_configs.py ADDED
File without changes
config/model_configs.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config/model_configs.py
# Central registry of model identifiers and generation parameters used by the
# RAG pipeline. All values are Hugging Face / OpenAI model IDs unless noted.

# Embedding Models
TEXT_EMBEDDING_MODEL: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
IMAGE_EMBEDDING_MODEL: str = "openai/clip-vit-base-patch32"  # or other CLIP variants
AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"  # example CLAP model

# Generator Model (LLM/LMM)
GENERATOR_MODEL_NAME: str = "gpt-4o"  # or "google/gemma-2b", "meta-llama/Llama-2-7b-chat-hf", "llava-hf/llava-1.5-7b-hf"
GENERATOR_MODEL_MAX_TOKENS: int = 4096
GENERATOR_MODEL_TEMPERATURE: float = 0.7

# Reranker Model
RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Automatic Speech Recognition (ASR) model (e.g. Hugging Face Whisper).
ASR_MODEL: str = "openai/whisper-tiny"  # "base"/"small"/"medium" depending on GPU resources
config/settings.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+ import os
3
+ from typing import Optional
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
class Settings(BaseSettings):
    """Application-wide configuration, loaded from environment / .env.

    Field values may be overridden by environment variables (case-sensitive);
    unknown env entries are ignored via `extra="ignore"`.
    """
    APP_NAME: str = "Multimedia RAG Assistant"
    APP_VERSION: str = "0.1.0"
    # BUG FIX: os.getenv('ENVIRONMENT') returns None when the variable is not
    # set, which fails pydantic validation for a non-optional `str` field and
    # crashes at import time. Fall back to a sane default instead.
    ENVIRONMENT: str = os.getenv('ENVIRONMENT', 'development')

    # Data layout: raw uploads plus derived artifacts (chunks/metadata/embeddings).
    DATA_DIR: str = "data"
    RAW_DATA_DIR: str = os.path.join("data", "raw")
    PROCESSED_DATA_DIR: str = os.path.join("data", "processed")
    CHUNKS_DIR: str = os.path.join("data", "processed", "chunks")
    METADATA_DIR: str = os.path.join("data", "processed", "metadata")
    EMBEDDINGS_DIR: str = os.path.join("data", "processed", "embeddings")

    # API server bind address.
    API_HOST: str = "0.0.0.0"
    API_PORT: int = 8000

    # Model configuration / credentials (API keys, model IDs, etc.).
    HUGGINGFACE_API_KEY: Optional[str] = os.getenv('HUGGINGFACE_API_KEY')  # e.g. when using Hugging Face models

    # Logger configuration
    LOG_LEVEL: str = "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
        case_sensitive=True
    )

# Singleton settings instance imported across the project.
settings = Settings()
core/data_processing/audio_processor.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/data_processing/audio_processor.py
2
+ import os
3
+
4
+ from typing import List, Dict, Any
5
+ from utils.logger import logger
6
+ from pydub import AudioSegment
7
+ from pydub.silence import split_on_silence
8
+ from config.settings import settings
9
+
10
+ class AudioProcessor:
11
+ def __init__(self, min_silence_len: int = 1000, silence_thresh_db: int = -40, target_sr: int = 16000):
12
+ self.min_silence_len = min_silence_len
13
+ self.silence_thresh_db = silence_thresh_db
14
+ self.target_sr = target_sr
15
+ logger.info(f"AudioProcessor initialized (min_silence_len={min_silence_len}ms, silence_thresh_db={silence_thresh_db}dB).")
16
+
17
+ def process(self, file_path: str) -> List[Dict[str, Any]]:
18
+ try:
19
+ logger.info(f"Processing audio file: {file_path}")
20
+ audio = AudioSegment.from_file(file_path)
21
+
22
+ if audio.frame_rate != self.target_sr:
23
+ audio = audio.set_frame_rate(self.target_sr)
24
+
25
+ audio_segments = split_on_silence(
26
+ audio,
27
+ min_silence_len=self.min_silence_len,
28
+ silence_thresh=self.silence_thresh_db,
29
+ keep_silence=500
30
+ )
31
+
32
+ chunks = []
33
+ audio_chunks_dir = os.path.join(settings.CHUNKS_DIR, "audio")
34
+ os.makedirs(audio_chunks_dir, exist_ok=True)
35
+
36
+ for i, segment in enumerate(audio_segments):
37
+ segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
38
+ chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
39
+
40
+ # Lưu segment thành file WAV tạm thời
41
+ segment.export(chunk_file_path, format="wav")
42
+
43
+ metadata = {
44
+ "source_id": os.path.basename(file_path),
45
+ "type": "audio",
46
+ "chunk_id": segment_id,
47
+ "chunk_data_path": chunk_file_path,
48
+ # "start_time_ms": int(segment.start_time),
49
+ # "end_time_ms": int(segment.end_time),
50
+ "duration_ms": len(segment)
51
+ }
52
+ chunks.append({
53
+ "content": chunk_file_path,
54
+ "metadata": metadata
55
+ })
56
+ logger.info(f"Generated {len(chunks)} audio segments from {file_path}")
57
+ return chunks
58
+ except FileNotFoundError:
59
+ logger.error(f"Audio file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
60
+ return []
61
+ except Exception as e:
62
+ logger.error(f"Error processing audio file {file_path}: {e}")
63
+ return []
64
+
65
+ # Ví dụ sử dụng (giữ nguyên để kiểm tra)
66
# Manual smoke test: split a sample audio file and print the resulting chunks.
if __name__ == "__main__":
    sample_audio_path = os.path.join(settings.RAW_DATA_DIR, "audios", "sample_audio.wav")
    if not os.path.exists(sample_audio_path):
        print(f"ERROR: Sample audio not found at {sample_audio_path}. Please create it first.")
        print("Make sure you have ffmpeg installed and available in your PATH for pydub to work.")
    else:
        processor = AudioProcessor()
        audio_chunks = processor.process(sample_audio_path)

        for i, chunk in enumerate(audio_chunks):
            print(f"\n--- Audio Chunk {i+1} ---")
            print(f"Type: {chunk['metadata']['type']}")
            print(f"Content (path): {chunk['content']}")
            print(f"Metadata: {chunk['metadata']}")
            # You can open the file at chunk['content'] to listen to it.
core/data_processing/image_processor.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/data_processing/image_processor.py
2
+ from typing import List, Dict, Any
3
+ import os
4
+ from PIL import Image
5
+ from utils.logger import logger
6
+
7
class ImageProcessor:
    """Turn one image file into a single retrieval chunk.

    The chunk's content is the image's filesystem path (mirroring how the
    audio/video processors reference media), and its metadata captures basic
    image properties (dimensions, format, file size).
    """

    def __init__(self):
        logger.info("ImageProcessor initialized.")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        """Return a one-element chunk list for *file_path*, or [] on failure.

        Errors are logged rather than raised, matching the other processors.
        """
        try:
            logger.debug(f"Processing image file: {file_path}")

            if not os.path.exists(file_path):
                logger.error(f"Image file not found: {file_path}")
                return []

            # Read dimensions/format; the context manager closes the handle.
            with Image.open(file_path) as img:
                w, h = img.size
                fmt = img.format

            base_name = os.path.basename(file_path)

            # Single chunk whose id is derived from the filename stem.
            return [{
                "content": file_path,
                "metadata": {
                    "source_id": base_name,
                    "type": "image",
                    "chunk_id": f"{os.path.splitext(base_name)[0]}_image_chunk",
                    "chunk_data_path": file_path,
                    "image_width": w,
                    "image_height": h,
                    "image_format": fmt,
                    "file_size_bytes": os.path.getsize(file_path),
                },
            }]

        except Exception as e:
            logger.error(f"Error processing image file {file_path}: {e}")
            return []
54
+
55
+ # Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
56
# Manual smoke test: create a dummy image, process it, print the chunk, clean up.
if __name__ == "__main__":
    from config.settings import settings
    import os

    # Create a dummy image for testing.
    dummy_image_dir = os.path.join(settings.RAW_DATA_DIR, "images")
    os.makedirs(dummy_image_dir, exist_ok=True)
    dummy_image_path = os.path.join(dummy_image_dir, "test_image.jpg")

    try:
        # Create a small solid-blue sample image.
        dummy_img = Image.new('RGB', (100, 150), color = 'blue')
        dummy_img.save(dummy_image_path)
        print(f"Created a dummy image for testing at: {dummy_image_path}")

        # Instantiate the processor and process the image.
        processor = ImageProcessor()
        image_chunks = processor.process(dummy_image_path)

        if image_chunks:
            print("\n--- Image Chunk Processed ---")
            chunk = image_chunks[0]
            print(f"Content (path): {chunk['content']}")
            print("Metadata:")
            for key, value in chunk['metadata'].items():
                print(f"  - {key}: {value}")
        else:
            print("Failed to process the dummy image.")

    finally:
        # Clean up the dummy image.
        if os.path.exists(dummy_image_path):
            os.remove(dummy_image_path)
            print(f"Cleaned up dummy image: {dummy_image_path}")
core/data_processing/text_processor.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from typing import List, Dict, Any
4
+ from utils.logger import logger
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+
7
class TextProcessor:
    """Chunk plain-text documents using LangChain's RecursiveCharacterTextSplitter.

    Each chunk carries its source filename, a deterministic chunk id, and the
    character offsets of the chunk within the original document.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,  # character count; a tokenizer could be swapped in
            add_start_index=True
        )
        logger.info(f"TextProcessor initialized with LangChain's RecursiveCharacterTextSplitter (chunk_size={chunk_size}, chunk_overlap={chunk_overlap})")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        """Read *file_path* as UTF-8, split it, and return chunk dicts.

        Returns [] on any failure; errors are logged rather than raised.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as fh:
                full_text = fh.read()
            logger.info(f"Processing text document: {file_path}")

            source_name = os.path.basename(file_path)
            stem = source_name.split('.')[0]

            chunks = []
            for idx, piece in enumerate(self.text_splitter.split_text(full_text)):
                # NOTE(review): str.find returns the FIRST occurrence, which can
                # mis-locate a chunk when overlapping chunks repeat verbatim —
                # the splitter's own start_index metadata would be exact; verify.
                start = full_text.find(piece)
                chunks.append({
                    "content": piece,
                    "metadata": {
                        "source_id": source_name,
                        "type": "text",
                        "chunk_id": f"{stem}_chunk_text_{idx}",
                        "start_char_index": start,      # starting character position
                        "end_char_index": start + len(piece),  # ending character position
                        "content_length": len(piece),
                    },
                })
            logger.info(f"Generated {len(chunks)} text chunks from {file_path}")
            return chunks
        except Exception as e:
            logger.error(f"Error processing text document {file_path}: {e}")
            return []
48
+
49
+ # Ví dụ sử dụng (giữ nguyên để kiểm tra)
50
# Manual smoke test: chunk a sample document and print every chunk.
if __name__ == "__main__":
    from config.settings import settings
    import os

    sample_doc_path = os.path.join(settings.RAW_DATA_DIR, "documents", "sample_document.txt")
    if not os.path.exists(sample_doc_path):
        print(f"ERROR: Sample document not found at {sample_doc_path}. Please create it first.")
    else:
        processor = TextProcessor(chunk_size=100, chunk_overlap=20)  # smaller sizes make chunking visible
        text_chunks = processor.process(sample_doc_path)

        for i, chunk in enumerate(text_chunks):  # print every chunk for inspection
            print(f"\n--- Chunk {i+1} ---")
            print(f"Content: {chunk['content']}")  # full chunk content
            print(f"Metadata: {chunk['metadata']}")
core/data_processing/video_processor.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/data_processing/video_processor.py
2
+ import os
3
+ import torch
4
+ import cv2
5
+ import numpy as np
6
+
7
+ from typing import List, Dict, Any
8
+ from utils.logger import logger
9
+ from moviepy.editor import VideoFileClip
10
+ from config.settings import settings
11
+
12
class VideoProcessor:
    """Cut a video into fixed-duration segments, exporting (a) sampled frames
    per segment as JPEGs ("video_frame" chunks) and (b) the segment itself as
    an MP4 clip ("video_segment_clip" chunks)."""

    def __init__(self, chunk_duration_sec: int = 10, frames_per_segment: int = 5):
        # chunk_duration_sec: length of each video segment in seconds.
        # frames_per_segment: number of frames sampled from each segment.
        self.chunk_duration_sec = chunk_duration_sec
        self.frames_per_segment = frames_per_segment
        logger.info(f"VideoProcessor initialized (chunk_duration={chunk_duration_sec}s, frames_per_segment={frames_per_segment}).")

    def process_video(self, file_path: str) -> List[Dict[str, Any]]:
        """Return frame-chunk and clip-chunk dicts for *file_path*, or [] on
        failure (errors are logged). Requires ffmpeg on PATH for moviepy.
        """
        try:
            logger.info(f"Processing video file: {file_path}")
            video_clip = VideoFileClip(file_path)
            total_duration = video_clip.duration  # total video duration (seconds)

            all_chunks = []

            # Output directory for sampled frames of this video.
            image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks", os.path.basename(file_path).split('.')[0])
            os.makedirs(image_chunks_dir, exist_ok=True)

            # Output directory for the exported MP4 segments of this video.
            video_segments_dir = os.path.join(settings.CHUNKS_DIR, "video/video_segments", os.path.basename(file_path).split('.')[0])
            os.makedirs(video_segments_dir, exist_ok=True)

            current_time = 0.0
            chunk_idx = 0

            while current_time < total_duration:
                end_time = min(current_time + self.chunk_duration_sec, total_duration)  # end time of this segment
                segment_clip = video_clip.subclip(current_time, end_time)

                segment_base_name = f"{os.path.basename(file_path).split('.')[0]}_segment_{chunk_idx}"

                frames_paths = []

                # Sample timestamps strictly inside the segment: linspace over
                # n+2 points, then drop the two endpoints.
                frame_timestamps = np.linspace(0, segment_clip.duration, self.frames_per_segment + 2)[1:-1]

                for ts in frame_timestamps:
                    frame = segment_clip.get_frame(ts)
                    frame_filename = f"{segment_base_name}_frame_{int(ts*1000)}.jpg"
                    frame_path = os.path.join(image_chunks_dir, frame_filename)
                    # moviepy yields RGB; OpenCV writes BGR, so convert first.
                    cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                    frames_paths.append(frame_path)

                # 1. Chunk for the sampled frames of this segment.
                image_chunk_id = f"{segment_base_name}_image"
                all_chunks.append({
                    "content": frames_paths,  # list of paths to the frame images
                    "metadata": {
                        "source_id": os.path.basename(file_path),
                        "type": "video_frame",  # chunk type (note: singular)
                        "chunk_id": image_chunk_id,
                        "start_time_sec": current_time,
                        "end_time_sec": end_time,
                        "frame_paths": frames_paths  # duplicated in metadata for lookup
                    }
                })

                # 2. Export the segment clip itself (optional but useful for video retrieval).
                video_segment_path = os.path.join(video_segments_dir, f"{segment_base_name}.mp4")
                segment_clip.write_videofile(video_segment_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)

                # Chunk for the exported video segment.
                video_chunk_id = f"{segment_base_name}_video_clip"
                all_chunks.append({
                    "content": video_segment_path,  # path to the clip file
                    "metadata": {
                        "source_id": os.path.basename(file_path),
                        "type": "video_segment_clip",  # chunk type: video clip
                        "chunk_id": video_chunk_id,
                        "start_time_sec": current_time,
                        "end_time_sec": end_time,
                        "chunk_data_path": video_segment_path  # duplicated in metadata for lookup
                    }
                })

                current_time = end_time
                chunk_idx += 1

            video_clip.close()  # release decoder resources
            logger.info(f"Generated {len(all_chunks)} chunks (frames & video segments) from video {file_path}")
            return all_chunks
        except FileNotFoundError:
            logger.error(f"Video file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
            return []
        except Exception as e:
            logger.error(f"Error processing video file {file_path}: {e}")
            return []
98
+
99
+ # Ví dụ sử dụng (giữ nguyên để kiểm tra)
100
# Manual smoke test: segment a sample video and print every generated chunk.
if __name__ == "__main__":
    sample_video_path = os.path.join(settings.RAW_DATA_DIR, "videos", "sample_video.mp4")
    if not os.path.exists(sample_video_path):
        print(f"ERROR: Sample video not found at {sample_video_path}. Please create it first.")
        print("Make sure you have ffmpeg installed and available in your PATH for moviepy to work.")
    else:
        processor = VideoProcessor(chunk_duration_sec=5, frames_per_segment=3)
        video_chunks = processor.process_video(sample_video_path)

        for i, chunk in enumerate(video_chunks):
            print(f"\n--- Video Chunk {i+1} ---")
            print(f"Type: {chunk['metadata']['type']}")
            # BUG FIX: the processor emits type "video_frame" (singular), but the
            # original demo compared against "video_frames" and so never printed
            # the frame-chunk branch.
            if chunk['metadata']['type'] == 'video_frame':
                print(f"Content (paths): {chunk['content']}")
                if chunk['content']:
                    print(f"Sample frame: {chunk['content'][0]}")
            elif chunk['metadata']['type'] == 'video_segment_clip':
                print(f"Content (path): {chunk['content']}")
            print(f"Metadata: {chunk['metadata']}")
+ print(f"Metadata: {chunk['metadata']}")
core/embeddings/audio_embedding_model.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/embeddings/audio_embedding_model.py
2
+ import torch
3
+ import librosa
4
+ import numpy as np
5
+
6
+ from typing import List
7
+ from transformers import AutoProcessor, AutoModel
8
+ from utils.logger import logger
9
+ from config.model_configs import AUDIO_EMBEDDING_MODEL
10
+
11
class AudioEmbeddingModel:
    """Wraps a pretrained audio model and maps audio files to L2-normalised embeddings."""

    def __init__(self):
        # Prefer GPU when one is visible; both processor inputs and the model live there.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Audio Embedding Model '{AUDIO_EMBEDDING_MODEL}' to device: {self.device}")

        self.processor = AutoProcessor.from_pretrained(AUDIO_EMBEDDING_MODEL)
        self.model = AutoModel.from_pretrained(AUDIO_EMBEDDING_MODEL).to(self.device)
        logger.info("Audio Embedding Model loaded successfully.")

    def _load_waveforms(self, audio_paths: List[str], sample_rate: int) -> list:
        """Decode each file to a waveform resampled to *sample_rate*; unreadable files are logged and skipped."""
        audio_inputs = []
        for audio_path in audio_paths:
            try:
                audio_data, _ = librosa.load(audio_path, sr=sample_rate)
            except Exception as e:
                logger.warning(f"Could not load audio {audio_path}: {e}. Skipping.")
            else:
                audio_inputs.append(audio_data)
        return audio_inputs

    def get_embeddings(self, audio_paths: List[str]) -> List[List[float]]:
        """Return one unit-length embedding (list of floats) per readable audio file.

        Files that cannot be decoded are skipped, so the output may be shorter
        than the input list. Returns [] when no input is usable.
        """
        if not audio_paths:
            return []

        # The processor dictates the sampling rate the underlying model expects.
        sample_rate = self.processor.feature_extractor.sampling_rate
        audio_inputs = self._load_waveforms(audio_paths, sample_rate)
        if not audio_inputs:
            return []

        inputs = self.processor(
            audios=audio_inputs,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True,
        ).to(self.device)

        with torch.no_grad():
            audio_features = self.model.get_audio_features(**inputs)

        # L2-normalise so cosine similarity reduces to a dot product.
        normalised = audio_features / audio_features.norm(p=2, dim=-1, keepdim=True)
        embeddings_list = normalised.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
        return embeddings_list
48
+
49
# Usage example (module-internal check only).
if __name__ == "__main__":
    from config.settings import settings
    import os

    def _make_dummy_audio(target_dir: str) -> str:
        """Write a 1-second silent WAV into *target_dir* and return its path."""
        from pydub import AudioSegment
        os.makedirs(target_dir, exist_ok=True)
        dummy_audio = AudioSegment.silent(duration=1000)  # 1 second of silence
        dummy_audio_path = os.path.join(target_dir, "dummy_audio.wav")
        dummy_audio.export(dummy_audio_path, format="wav")
        return dummy_audio_path

    model = AudioEmbeddingModel()
    # presumably audio segments produced by the audio processor — verify against pipeline
    sample_audio_dir = os.path.join(settings.PROCESSED_DATA_DIR, "audio_segments", "sample_audio")

    # Fall back to a generated dummy clip when no sample audio is available.
    # (Previously the dummy-creation code was duplicated in both branches.)
    if not os.path.exists(sample_audio_dir) or not os.listdir(sample_audio_dir):
        print(f"Creating a dummy audio for testing at {sample_audio_dir}...")
        sample_audio_paths = [_make_dummy_audio(sample_audio_dir)]
    else:
        sample_audio_paths = [os.path.join(sample_audio_dir, f) for f in os.listdir(sample_audio_dir) if f.endswith(('.wav', '.mp3'))]
        if not sample_audio_paths:
            print(f"No audio files found in {sample_audio_dir}. Please ensure sample audio was processed.")
            sample_audio_paths = [_make_dummy_audio(sample_audio_dir)]

    print(f"Using {len(sample_audio_paths)} sample audio clips: {sample_audio_paths[:2]}...")
    embeddings = model.get_embeddings(sample_audio_paths)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        if len(embeddings) > 1:
            # Sanity check: similarity between the first two clips.
            from sklearn.metrics.pairwise import cosine_similarity
            sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
            print(f"Similarity between audio 1 and 2: {sim:.4f}")
core/embeddings/image_embedding_model.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from typing import List
4
+ from PIL import Image
5
+ from transformers import CLIPProcessor, CLIPModel
6
+ from utils.logger import logger
7
+ from config.model_configs import IMAGE_EMBEDDING_MODEL
8
+
9
class ImageEmbeddingModel:
    """CLIP-based encoder that maps image files to L2-normalised embedding vectors."""

    def __init__(self):
        # Use the GPU when available; the processor output is moved there per call.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")

        self.model = CLIPModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
        logger.info("Image Embedding Model loaded successfully.")

    def _load_images(self, image_paths: List[str]) -> list:
        """Open every path as an RGB PIL image; unreadable files are logged and skipped."""
        images = []
        for img_path in image_paths:
            try:
                images.append(Image.open(img_path).convert("RGB"))
            except Exception as e:
                logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
        return images

    def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
        """Return one unit-length CLIP embedding per readable image file.

        Unreadable files are skipped, so the output may be shorter than the
        input list. Returns [] when no input is usable.
        """
        if not image_paths:
            return []

        images = self._load_images(image_paths)
        if not images:
            return []

        inputs = self.processor(images=images, return_tensors="pt").to(self.device)

        with torch.no_grad():
            image_features = self.model.get_image_features(pixel_values=inputs.pixel_values)

        # Unit-normalise so cosine similarity is a plain dot product.
        normalised = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        embeddings_list = normalised.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(images)} images.")
        return embeddings_list
43
+
44
# Usage example (module-internal check only).
if __name__ == "__main__":
    from config.settings import settings
    import os

    def _make_dummy_image(target_dir: str) -> str:
        """Write a small solid-red JPEG into *target_dir* and return its path."""
        os.makedirs(target_dir, exist_ok=True)
        dummy_image_path = os.path.join(target_dir, "dummy_image.jpg")
        dummy_img = Image.new('RGB', (60, 30), color = 'red')
        dummy_img.save(dummy_image_path)
        return dummy_image_path

    model = ImageEmbeddingModel()
    # presumably frames extracted from a sample video — verify against pipeline
    sample_image_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks/sample_video")

    # Fall back to a generated dummy image when no sample image is available.
    # (Previously the dummy-creation code was duplicated in both branches, and
    # the second copy did not ensure the directory exists.)
    if not os.path.exists(sample_image_dir) or not os.listdir(sample_image_dir):
        print(f"Creating a dummy image for testing at {sample_image_dir}...")
        sample_image_paths = [_make_dummy_image(sample_image_dir)]
    else:
        sample_image_paths = [os.path.join(sample_image_dir, f) for f in os.listdir(sample_image_dir) if f.endswith(('.jpg', '.png'))]
        if not sample_image_paths:  # directory exists but holds no images
            print(f"No images found in {sample_image_dir}. Please ensure sample video was processed.")
            sample_image_paths = [_make_dummy_image(sample_image_dir)]

    print(f"Using {len(sample_image_paths)} sample images: {sample_image_paths[:2]}...")
    embeddings = model.get_embeddings(sample_image_paths)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        # When at least two images exist, compare the first pair.
        if len(embeddings) > 1:
            from sklearn.metrics.pairwise import cosine_similarity
            sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
            print(f"Similarity between image 1 and 2: {sim:.4f}")
core/embeddings/text_embedding_model.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from typing import List
4
+ from sentence_transformers import SentenceTransformer
5
+ from utils.logger import logger
6
+ from config.model_configs import TEXT_EMBEDDING_MODEL
7
+
8
class TextEmbeddingModel:
    """SentenceTransformer wrapper that maps strings to dense embedding vectors."""

    def __init__(self):
        # Run on GPU when available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Text Embedding Model '{TEXT_EMBEDDING_MODEL}' to device: {self.device}")

        self.model = SentenceTransformer(TEXT_EMBEDDING_MODEL, device=self.device)
        logger.info("Text Embedding Model loaded successfully.")

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding (list of floats) per input string; [] for empty input."""
        if not texts:
            return []

        encoded = self.model.encode(texts, convert_to_numpy=True)
        embeddings = encoded.tolist()
        logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
        return embeddings
23
+
24
# Usage example (module-internal check only).
if __name__ == "__main__":
    embedder = TextEmbeddingModel()
    sample_texts = [
        "This is a test sentence.",
        "Another sentence for embedding.",
        "How about some natural language processing?",
        "Xe hơi màu đỏ đang chạy trên đường phố."  # also try Vietnamese
    ]
    embeddings = embedder.get_embeddings(sample_texts)

    print(f"Number of embeddings: {len(embeddings)}")
    if embeddings:
        print(f"Dimension of embeddings: {len(embeddings[0])}")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        # Cosine similarity of the first two embeddings as a quick sanity check.
        from sklearn.metrics.pairwise import cosine_similarity
        sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
        print(f"Similarity between text 1 and 2: {sim:.4f}")
core/retrieval/retriever.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/retrieval/retriever.py
2
+ import os
3
+
4
+ from typing import List, Tuple, Dict, Any, Union
5
+ from utils.logger import logger
6
+ from config.settings import settings
7
+ from qdrant_client import QdrantClient
8
+
9
+ from core.embeddings.text_embedding_model import TextEmbeddingModel
10
+ from core.embeddings.image_embedding_model import ImageEmbeddingModel
11
+ from core.embeddings.audio_embedding_model import AudioEmbeddingModel
12
+
13
+ from core.retrieval.vector_db_manager import VectorDBManager
14
+
15
class Retriever:
    """Multimodal retriever: embeds a query with the modality-appropriate model
    and searches the corresponding Qdrant collection.
    """

    def __init__(self, client: QdrantClient):
        """Build the three embedders and attach one VectorDBManager per modality.

        Args:
            client: Shared QdrantClient instance (all collection managers reuse it).
        """
        logger.info("Initializing the Retriever...")

        # Initialize embedding models
        self.text_embedder = TextEmbeddingModel()
        self.image_embedder = ImageEmbeddingModel()
        self.audio_embedder = AudioEmbeddingModel()
        logger.info("Embedding models initialized.")

        qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
        self.client = client
        logger.info(f"Single Qdrant client initialized, connected to: {qdrant_db_path}")

        # Initialize vector database managers (one collection per modality).
        # Text dimension is read from the model; the 512 for image/audio is the
        # projection size assumed at ingestion time — must match ingest_data.py.
        text_dim = self.text_embedder.model.get_sentence_embedding_dimension()
        self.text_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_dim, client=self.client)

        image_dim = 512
        self.image_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_dim, client=self.client)

        audio_dim = 512
        self.audio_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=audio_dim, client=self.client)

        logger.info("VectorDB Managers connected to Qdrant collections.")
        logger.info(f"Text collection ('{self.text_db_manager.collection_name}') contains {self.text_db_manager.get_total_vectors()} vectors.")
        logger.info(f"Image collection ('{self.image_db_manager.collection_name}') contains {self.image_db_manager.get_total_vectors()} vectors.")
        logger.info(f"Audio collection ('{self.audio_db_manager.collection_name}') contains {self.audio_db_manager.get_total_vectors()} vectors.")

    def retrieve(self, query: Union[str, bytes], query_type: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Embed *query* and return the *top_k* most similar stored chunks.

        Args:
            query: A text string, or a path to an image/audio file on disk.
            query_type: One of "text", "image", "audio".
            top_k: Number of nearest neighbours to return.

        Returns:
            List of dicts with "score", "metadata" and "content" keys; empty
            on error or unsupported query type.
        """
        logger.info(f"Received retrieval request. Query type: '{query_type}', Top K: {top_k}")

        embedding = None
        db_manager_to_use = None

        # create embeddings
        try:
            if query_type == "text":
                if not isinstance(query, str):
                    raise TypeError("Text query must be a string.")
                # FIX: get_embeddings expects a list of strings — wrap the query and
                # unwrap the single result, consistent with the image/audio branches.
                embedding = self.text_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.text_db_manager
            elif query_type == "image":
                if not isinstance(query, str) or not os.path.exists(query):
                    raise TypeError("Image query must be a valid file path.")
                embedding = self.image_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.image_db_manager
            elif query_type == "audio":
                if not isinstance(query, str) or not os.path.exists(query):
                    raise TypeError("Audio query must be a valid file path.")
                embedding = self.audio_embedder.get_embeddings([query])[0]
                db_manager_to_use = self.audio_db_manager
            else:
                logger.error(f"Unsupported query type: {query_type}")
                return []
        except Exception as e:
            logger.error(f"Error generating embedding for query: {e}")
            return []

        if embedding is None:
            logger.warning("Could not generate embedding for the query.")
            return []

        # searching vectors
        try:
            search_results = db_manager_to_use.search_vectors(embedding, k=top_k)
        except Exception as e:
            logger.error(f"Error searching in vector database: {e}")
            return []

        # Flatten (score, payload) pairs into the public result shape.
        formatted_results = []
        for score, payload in search_results:
            formatted_results.append({
                "score": score,
                "metadata": payload['metadata'],
                "content": payload['content']
            })

        logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
        return formatted_results

    def is_database_empty(self) -> bool:
        """Return True when all three modality collections hold zero vectors."""
        total_vectors = self.text_db_manager.get_total_vectors() \
            + self.image_db_manager.get_total_vectors() \
            + self.audio_db_manager.get_total_vectors()

        return total_vectors == 0
110
+
111
if __name__ == "__main__":
    from config.settings import settings

    logger.info("--- Running Retriever Standalone Test (Qdrant version) ---")

    # Check whether Qdrant already holds ingested data.
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    if not os.path.exists(qdrant_db_path):
        print("\n\nERROR: Qdrant database not found. Please run 'python scripts/ingest_data.py' first to create the database.\n\n")
    else:
        # FIX: Retriever.__init__ requires a QdrantClient — previously this called
        # Retriever() with no arguments, which raised a TypeError at runtime.
        client = QdrantClient(path=qdrant_db_path)
        retriever = Retriever(client)

        # --- 1. Text query ---
        print("\n--- Testing Text Retrieval ---")
        text_query = "What is artificial intelligence?"
        text_results = retriever.retrieve(text_query, query_type="text", top_k=3)
        print(f"Query: '{text_query}'")
        for i, result in enumerate(text_results):
            print(f"  Result {i+1}:")
            print(f"    Score: {result['score']:.4f}")
            print(f"    Type: {result['metadata']['type']}")
            print(f"    Content Preview: {str(result.get('content'))[:200] if result.get('content') else 'N/A'}...")
            print(f"    Source: {result['metadata']['source_id']}")

        # --- 2. Image query ---
        print("\n--- Testing Image Retrieval ---")
        # Reuse one of the processed chunk images as the query image.
        image_to_query = None
        image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks")
        if os.path.exists(image_chunks_dir):
            for root, _, files in os.walk(image_chunks_dir):
                if files:
                    image_to_query = os.path.join(root, files[0])
                    break

        if image_to_query and os.path.exists(image_to_query):
            print(f"Using image as query: {image_to_query}")
            image_results = retriever.retrieve(image_to_query, query_type="image", top_k=3)
            for i, result in enumerate(image_results):
                print(f"  Result {i+1}:")
                print(f"    Score: {result['score']:.4f}")
                print(f"    Type: {result['metadata']['type']}")
                print(f"    Content (Paths): {result['content']}")
                print(f"    Source: {result['metadata']['source_id']}")
        else:
            print("Could not find a sample image to test image retrieval.")
core/retrieval/vector_db_manager.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from typing import List, Tuple, Dict, Any
4
+ from uuid import uuid4
5
+
6
+ from qdrant_client import QdrantClient, models
7
+ from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
8
+
9
+ from utils.logger import logger
10
+ from config.settings import settings
11
+
12
class VectorDBManager:
    """Thin wrapper around a single Qdrant collection: create, upsert, search, count."""

    def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
        """Attach to (or lazily create) one collection.

        Args:
            collection_name: Name of the Qdrant collection to manage.
            embedding_dim: Vector size used when the collection must be created.
            client: Optional shared QdrantClient; a local file-backed client is
                created when omitted.
        """
        logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")

        if client:
            self.client = client
            logger.info("Using shared Qdrant client instance.")
        else:
            logger.warning("No shared Qdrant client provided. Creating a new local instance.")
            qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
            self.client = QdrantClient(path=qdrant_db_path)

        self.collection_name = collection_name
        self.embedding_dim = embedding_dim

        self.create_collection_if_not_exists()

    def create_collection_if_not_exists(self):
        """Create the collection with a cosine-distance index if it is missing.

        Raises:
            Exception: Re-raised after logging when the Qdrant call fails.
        """
        try:
            collections = self.client.get_collections().collections
            collection_names = [collection.name for collection in collections]

            if self.collection_name not in collection_names:
                logger.info(f"Collection '{self.collection_name}' not found. Creating a new one...")

                # FIX: use create_collection — recreate_collection is deprecated and
                # destructive (it drops an existing collection), which is never the
                # intent inside this not-exists branch.
                self.client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(
                        size=self.embedding_dim,
                        distance=Distance.COSINE
                    )
                )
                logger.success(f"Collection '{self.collection_name}' created successfully.")
            else:
                logger.info(f"Collection '{self.collection_name}' already exists.")

        except Exception as e:
            logger.error(f"Error checking or creating collection '{self.collection_name}': {e}")
            raise

    def add_vectors(self, embeddings: List[List[float]], metadatas: List[Dict[str, Any]]):
        """Upsert embeddings with their payloads; each point gets a fresh UUID id.

        Args:
            embeddings: One vector per point.
            metadatas: One payload dict per point (same length as *embeddings*).

        Raises:
            ValueError: When the two lists differ in length.
        """
        if not embeddings:
            logger.warning("No embeddings to add. Skipping.")
            return

        if len(embeddings) != len(metadatas):
            logger.error("Number of embeddings and metadatas must match.")
            raise ValueError("Embeddings and metadatas count mismatch.")

        # Random UUIDs as point ids: each upsert inserts new points rather than
        # overwriting existing ones.
        points_to_add = [
            PointStruct(id=str(uuid4()), vector=embedding, payload=metadata)
            for embedding, metadata in zip(embeddings, metadatas)
        ]

        try:
            # wait=True blocks until the write is applied so the count below is accurate.
            operation_info = self.client.upsert(
                collection_name=self.collection_name,
                wait=True,
                points=points_to_add
            )
            if operation_info.status == UpdateStatus.COMPLETED:
                logger.debug(f"Successfully upserted {len(points_to_add)} points to collection '{self.collection_name}'.")
            else:
                logger.warning(f"Upsert operation finished with status: {operation_info.status}")
        except Exception as e:
            logger.error(f"Error upserting points to collection '{self.collection_name}': {e}")

    def search_vectors(self, query_embedding: List[float], k: int = 5, filter_payload: Dict = None) -> List[Tuple[float, Dict[str, Any]]]:
        """Return the top-*k* (score, payload) pairs for *query_embedding*.

        Args:
            query_embedding: Query vector of the collection's dimension.
            k: Maximum number of results.
            filter_payload: Optional Qdrant filter applied before scoring.

        Returns:
            List of (similarity_score, payload) tuples; empty on error.
        """
        try:
            search_results = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding,
                query_filter=filter_payload,
                limit=k,
                with_payload=True,   # include payload in return
                with_vectors=False   # exclude vectors in return
            )

            formatted_results = [(scored_point.score, scored_point.payload) for scored_point in search_results]

            logger.debug(f"Searched for top {k} neighbors. Found {len(formatted_results)} results.")
            return formatted_results
        except Exception as e:
            logger.error(f"Error searching in collection '{self.collection_name}': {e}")
            return []

    def get_total_vectors(self) -> int:
        """Return the exact number of points stored in the collection (0 on error)."""
        try:
            count_result = self.client.count(
                collection_name=self.collection_name,
                exact=True  # exact count, not an estimate
            )
            return count_result.count
        except Exception as e:
            logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
            return 0
119
+
120
# Usage example (module-internal check only).
if __name__ == "__main__":
    import numpy as np

    # Parameters for the throwaway test collection.
    TEST_COLLECTION_NAME = "my_test_collection"
    DUMMY_DIM = 128

    # --- Collection creation ---
    print("\n--- Testing Collection Creation ---")
    db_manager = VectorDBManager(collection_name=TEST_COLLECTION_NAME, embedding_dim=DUMMY_DIM)
    print(f"Total vectors initially: {db_manager.get_total_vectors()}")

    # --- Adding vectors with payloads ---
    print("\n--- Testing Add Vectors ---")
    dummy_embeddings = np.random.rand(10, DUMMY_DIM).tolist()
    dummy_metadatas = [
        {"chunk_id": f"dummy_chunk_{i}", "type": "text" if i < 5 else "image", "source_file": "test.txt"}
        for i in range(10)
    ]
    db_manager.add_vectors(dummy_embeddings, dummy_metadatas)
    print(f"Total vectors after adding: {db_manager.get_total_vectors()}")

    # --- Search ---
    print("\n--- Testing Search ---")
    dummy_query = np.random.rand(DUMMY_DIM).tolist()
    # FIX: search_vectors takes the neighbour count as `k`, not `top_k` —
    # the previous keyword raised a TypeError at runtime.
    results = db_manager.search_vectors(dummy_query, k=3)
    print(f"Top 3 results (no filter):")
    for score, payload in results:
        print(f"  Score: {score:.4f}, Payload: {payload}")

    # --- Search with pre-filtering ---
    print("\n--- Testing Search with Filter ---")
    filter_condition = models.Filter(
        must=[
            models.FieldCondition(
                key="type",  # filter on the payload's 'type' field
                match=models.MatchValue(value="image"),  # value must be 'image'
            )
        ]
    )
    filtered_results = db_manager.search_vectors(dummy_query, k=3, filter_payload=filter_condition)
    print(f"Top 3 results (filtered for type='image'):")
    for score, payload in filtered_results:
        print(f"  Score: {score:.4f}, Payload: {payload}")

    # --- Clean up the test collection ---
    print("\n--- Cleaning up test collection ---")
    db_manager.client.delete_collection(collection_name=TEST_COLLECTION_NAME)
    print(f"Collection '{TEST_COLLECTION_NAME}' deleted.")
requirements.txt ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.5.0
3
+ annotated-types==0.7.0
4
+ anyio==4.9.0
5
+ attrs==25.3.0
6
+ audioread==3.0.1
7
+ certifi==2025.7.14
8
+ cffi==1.17.1
9
+ charset-normalizer==3.4.2
10
+ click==8.2.1
11
+ colorama==0.4.6
12
+ contourpy==1.3.3
13
+ cycler==0.12.1
14
+ decorator==4.4.2
15
+ faiss-cpu==1.11.0.post1
16
+ fastapi==0.116.1
17
+ ffmpy==0.6.1
18
+ filelock==3.13.1
19
+ fonttools==4.59.0
20
+ fsspec==2024.6.1
21
+ gradio==5.6.0
22
+ gradio_client==1.4.3
23
+ grpcio==1.74.0
24
+ grpcio-tools==1.74.0
25
+ h11==0.16.0
26
+ h2==4.2.0
27
+ hpack==4.1.0
28
+ httpcore==1.0.9
29
+ httpx==0.28.1
30
+ huggingface-hub==0.34.3
31
+ hyperframe==6.1.0
32
+ idna==3.10
33
+ imageio==2.37.0
34
+ imageio-ffmpeg==0.6.0
35
+ importlib_resources==6.5.2
36
+ inquirerpy==0.3.4
37
+ Jinja2==3.1.4
38
+ joblib==1.5.1
39
+ jsonpatch==1.33
40
+ jsonpointer==3.0.0
41
+ jsonschema==4.25.0
42
+ jsonschema-specifications==2025.4.1
43
+ kiwisolver==1.4.8
44
+ langchain-core==0.2.43
45
+ langchain-text-splitters==0.2.0
46
+ langsmith==0.1.147
47
+ lazy_loader==0.4
48
+ librosa==0.10.2
49
+ llvmlite==0.44.0
50
+ loguru==0.7.2
51
+ markdown-it-py==3.0.0
52
+ MarkupSafe==2.1.5
53
+ matplotlib==3.10.5
54
+ mdurl==0.1.2
55
+ moviepy==1.0.3
56
+ mpmath==1.3.0
57
+ msgpack==1.1.1
58
+ narwhals==2.0.1
59
+ networkx==3.3
60
+ numba==0.61.2
61
+ numpy==1.26.4
62
+ opencv-python==4.12.0.88
63
+ orjson==3.11.1
64
+ packaging==24.2
65
+ pandas==2.3.1
66
+ pfzy==0.3.4
67
+ pillow==10.4.0
68
+ platformdirs==4.3.8
69
+ pooch==1.8.2
70
+ portalocker==2.10.1
71
+ proglog==0.1.12
72
+ prompt_toolkit==3.0.51
73
+ protobuf==6.31.1
74
+ pycparser==2.22
75
+ pydantic==2.10.6
76
+ pydantic-settings==2.3.4
77
+ pydantic_core==2.27.2
78
+ pydub==0.25.1
79
+ Pygments==2.19.2
80
+ pyparsing==3.2.3
81
+ python-dateutil==2.9.0.post0
82
+ python-dotenv==1.1.1
83
+ python-multipart==0.0.12
84
+ pytz==2025.2
85
+ pywin32==311
86
+ PyYAML==6.0.2
87
+ qdrant-client==1.9.0
88
+ referencing==0.36.2
89
+ regex==2024.11.6
90
+ requests==2.32.4
91
+ requests-toolbelt==1.0.0
92
+ rich==14.1.0
93
+ rpds-py==0.26.0
94
+ ruff==0.12.7
95
+ safehttpx==0.1.6
96
+ safetensors==0.5.3
97
+ scikit-learn==1.7.1
98
+ scipy==1.16.1
99
+ semantic-version==2.10.0
100
+ sentence-transformers==5.0.0
101
+ shellingham==1.5.4
102
+ six==1.17.0
103
+ sniffio==1.3.1
104
+ soundfile==0.13.1
105
+ soxr==0.5.0.post1
106
+ starlette==0.47.2
107
+ sympy==1.13.1
108
+ tenacity==8.5.0
109
+ threadpoolctl==3.6.0
110
+ tokenizers==0.19.1
111
+ tomlkit==0.12.0
112
+ torch==2.6.0+cu124
113
+ torchaudio==2.6.0+cu124
114
+ torchvision==0.21.0+cu124
115
+ tqdm==4.67.1
116
+ transformers==4.41.2
117
+ typer==0.16.0
118
+ typing-inspection==0.4.1
119
+ typing_extensions==4.12.2
120
+ tzdata==2025.2
121
+ urllib3==2.5.0
122
+ uvicorn==0.35.0
123
+ wcwidth==0.2.13
124
+ websockets==11.0.3
125
+ win32_setctime==1.2.0
scripts/ingest_data.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scripts/ingest_data.py
2
+ import os
3
+ import json
4
+ from tqdm import tqdm
5
+ from pathlib import Path
6
+ from typing import Dict
7
+ import shutil
8
+
9
+ from config.settings import settings
10
+ from utils.logger import logger
11
+ from qdrant_client import QdrantClient
12
+
13
+ # Import các Processor (không thay đổi)
14
+ from core.data_processing.text_processor import TextProcessor
15
+ from core.data_processing.audio_processor import AudioProcessor
16
+ # from core.data_processing.video_processor import VideoProcessor
17
+ from core.data_processing.image_processor import ImageProcessor
18
+
19
+ # Import các Embedding Model (không thay đổi)
20
+ from core.embeddings.text_embedding_model import TextEmbeddingModel
21
+ from core.embeddings.image_embedding_model import ImageEmbeddingModel
22
+ from core.embeddings.audio_embedding_model import AudioEmbeddingModel
23
+
24
+ # Import VectorDBManager phiên bản Qdrant MỚI
25
+ from core.retrieval.vector_db_manager import VectorDBManager
26
+
27
+ def walk_through_files(extentions: Dict, raw_dir: str, all_raw_chunks_from_processors, processor):
28
+ all_files = list(raw_dir.rglob("*"))
29
+ for filepath in tqdm(all_files, desc="Processing " + raw_dir.name):
30
+ if filepath.suffix in extentions and filepath.is_file():
31
+ all_raw_chunks_from_processors.extend(
32
+ processor.process(str(filepath))
33
+ )
34
+
35
+ def ingest_data_pipeline():
36
+ logger.info("Starting comprehensive data ingestion pipeline (Chunking + Embedding + Qdrant Indexing)...")
37
+
38
+ # --- 1. Khởi tạo các Processor --- (Không thay đổi)
39
+ text_processor = TextProcessor(chunk_size=500, chunk_overlap=50)
40
+ audio_processor = AudioProcessor(min_silence_len=1000, silence_thresh_db=-40, target_sr=16000)
41
+ image_processor = ImageProcessor()
42
+ # video_processor = VideoProcessor(chunk_duration_sec=15, frames_per_segment=5)
43
+
44
+ # --- Dọn dẹp các thư mục chunk và Qdrant data cũ ---
45
+ dirs_to_clean_and_create = [
46
+ settings.CHUNKS_DIR,
47
+ settings.METADATA_DIR
48
+ ]
49
+ # Thư mục dữ liệu của Qdrant
50
+ qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
51
+ dirs_to_clean_and_create.append(qdrant_db_path)
52
+
53
+ for dir_path in dirs_to_clean_and_create:
54
+ if os.path.exists(dir_path):
55
+ shutil.rmtree(dir_path)
56
+ logger.info(f"Cleaned up old directory: {dir_path}")
57
+ # Tạo lại các thư mục cho chunking, trừ thư mục qdrant (client sẽ tự tạo)
58
+ if dir_path != qdrant_db_path:
59
+ os.makedirs(dir_path, exist_ok=True)
60
+
61
+ logger.info("Output directories and previous Qdrant data are ready for fresh ingestion.")
62
+
63
+ qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
64
+ client = QdrantClient(path=qdrant_db_path)
65
+ logger.info(f"Single Qdrant client initialized for ingestion, connected to: {qdrant_db_path}")
66
+
67
+ all_raw_chunks_from_processors = [] # Chứa tất cả các chunk (bao gồm content và metadata)
68
+
69
+ # --- 2. Chạy Data Processing (Chunking) --- (Không thay đổi)
70
+ logger.info("--- Phase 1: Processing Raw Data into Chunks ---")
71
+
72
+ # Xử lý Văn bản
73
+ text_extentions = {".txt"}
74
+ text_raw_dir = Path(settings.RAW_DATA_DIR) / "texts"
75
+ walk_through_files(text_extentions, text_raw_dir, all_raw_chunks_from_processors, text_processor)
76
+
77
+ # Xử lý Âm thanh
78
+ audio_extentions = {".wav", ".mp3"}
79
+ audio_raw_dir = Path(settings.RAW_DATA_DIR) / "audios"
80
+ walk_through_files(audio_extentions, audio_raw_dir, all_raw_chunks_from_processors, audio_processor)
81
+
82
+ # process images
83
+ image_extentions = {".jpg", ".png"}
84
+ image_raw_dir = Path(settings.RAW_DATA_DIR) / "images"
85
+ walk_through_files(image_extentions, image_raw_dir, all_raw_chunks_from_processors, image_processor)
86
+
87
+ # Xử lý Video
88
+ # video_raw_dir = os.path.join(settings.RAW_DATA_DIR, "videos")
89
+ # for filename in tqdm(os.listdir(video_raw_dir), desc="Processing Video"):
90
+ # if filename.endswith((".mp4", ".avi", ".mov")):
91
+ # all_raw_chunks_from_processors.extend(video_processor.process_video(os.path.join(video_raw_dir, filename)))
92
+
93
+ logger.info(f"Total raw chunks processed from all sources: {len(all_raw_chunks_from_processors)}")
94
+
95
+ # --- 3. Tạo Embedding và Thêm vào Qdrant ---
96
+ logger.info("--- Phase 2: Generating Embeddings and Building Qdrant Collections ---")
97
+
98
+ # Khởi tạo các Embedding Model
99
+ text_embedder = TextEmbeddingModel()
100
+ image_embedder = ImageEmbeddingModel()
101
+ audio_embedder = AudioEmbeddingModel()
102
+
103
+ # --- Khởi tạo các VectorDBManager cho Qdrant ---
104
+ # Lấy kích thước embedding từ model để đảm bảo chính xác
105
+ text_embedding_dim = text_embedder.model.get_sentence_embedding_dimension()
106
+ text_vector_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_embedding_dim, client=client)
107
+
108
+ # Kích thước embedding cho image/audio (giả định là 512)
109
+ image_embedding_dim = 512
110
+ image_vector_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_embedding_dim, client=client)
111
+
112
+ # video_frame_embedding_dim = 512
113
+ # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)
114
+
115
+ audio_embedding_dim = 512
116
+ audio_vector_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=image_embedding_dim, client=client)
117
+
118
+ logger.info(f"Initialized Text Qdrant Collection Manager with {text_embedding_dim}D.")
119
+ logger.info(f"Initialized Image Qdrant Collection Manager with {image_embedding_dim}D.")
120
+ logger.info(f"Initialized Audio Qdrant Collection Manager with {audio_embedding_dim}D.")
121
+
122
+ # Tạo các batch để thêm vào Qdrant hiệu quả hơn
123
+ text_embeddings_batch = []
124
+ text_metadatas_batch = []
125
+
126
+ image_embeddings_batch = []
127
+ image_metadatas_batch = []
128
+
129
+ # video_frame_embeddings_batch = []
130
+ # video_frame_metadatas_batch = []
131
+
132
+ audio_embeddings_batch = []
133
+ audio_metadatas_batch = []
134
+
135
+ BATCH_SIZE = 32 # Thêm 32 điểm một lần
136
+
137
+ for chunk_data in tqdm(all_raw_chunks_from_processors, desc="Generating Embeddings & Populating Qdrant"):
138
+ chunk_type = chunk_data['metadata']['type']
139
+ content = chunk_data['content']
140
+
141
+ try:
142
+ if chunk_type == "text":
143
+ embedding = text_embedder.get_embeddings([content])[0]
144
+ text_embeddings_batch.append(embedding)
145
+ text_metadatas_batch.append(chunk_data)
146
+
147
+ elif chunk_type == "audio":
148
+ embedding = audio_embedder.get_embeddings([content])[0]
149
+ audio_embeddings_batch.append(embedding)
150
+ audio_metadatas_batch.append(chunk_data)
151
+
152
+ elif chunk_type == "image":
153
+ embedding = image_embedder.get_embeddings([content])[0]
154
+ image_embeddings_batch.append(embedding)
155
+ image_metadatas_batch.append(chunk_data)
156
+
157
+ # elif chunk_type == "video_frame":
158
+ # if content and isinstance(content, list) and len(content) > 0:
159
+ # embedding = image_embedder.get_embeddings([content[0]])[0] # Chỉ nhúng ảnh đầu tiên
160
+ # video_frame_embeddings_batch.append(embedding)
161
+ # video_frame_metadatas_batch.append(chunk_data['metadata'])
162
+
163
+ # Xử lý batch
164
+ if len(text_embeddings_batch) >= BATCH_SIZE:
165
+ text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
166
+ text_embeddings_batch, text_metadatas_batch = [], [] # Reset batch
167
+
168
+ if len(audio_embeddings_batch) >= BATCH_SIZE:
169
+ audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
170
+ audio_embeddings_batch, audio_metadatas_batch = [], [] # Reset batch
171
+
172
+ if len(image_embeddings_batch) >= BATCH_SIZE:
173
+ image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
174
+ image_embeddings_batch, image_metadatas_batch = [], [] # Reset batch
175
+
176
+ # if len(video_frame_embeddings_batch) >= BATCH_SIZE:
177
+ # video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
178
+ # video_frame_embeddings_batch, video_frame_metadatas_batch = [], [] # Reset batch
179
+
180
+ except Exception as e:
181
+ logger.error(f"Error processing chunk {chunk_data['metadata']['chunk_id']}: {e}")
182
+
183
+ # Thêm các embedding còn lại trong batch cuối cùng
184
+ if text_embeddings_batch:
185
+ text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
186
+ if audio_embeddings_batch:
187
+ audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
188
+ if image_embeddings_batch:
189
+ image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
190
+ # if video_frame_embeddings_batch:
191
+ # video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
192
+
193
+ logger.success("Finished populating Qdrant collections.")
194
+ logger.info(f"Total vectors in 'text_collection': {text_vector_db_manager.get_total_vectors()}")
195
+ logger.info(f"Total vectors in 'audio_collection': {audio_vector_db_manager.get_total_vectors()}")
196
+ logger.info(f"Total vectors in 'image_collection': {image_vector_db_manager.get_total_vectors()}")
197
+ # logger.info(f"Total vectors in 'video_frame_collection': {video_frame_vector_db_manager.get_total_vectors()}")
198
+
199
+ logger.info("Data ingestion pipeline completed successfully!")
200
+
201
+
202
# Script entry point: run the full data-ingestion pipeline when executed directly.
if __name__ == "__main__":
    ingest_data_pipeline()
scripts/ingestion.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/ingestion/ingestion_service.py
2
+ import os
3
+ import gradio as gr
4
+ from typing import List, Optional, Callable
5
+ from tqdm import tqdm
6
+
7
+ from utils.logger import logger
8
+ from config.settings import settings
9
+ from qdrant_client import QdrantClient
10
+
11
+ # Import các Processor (không thay đổi)
12
+ from core.data_processing.text_processor import TextProcessor
13
+ from core.data_processing.audio_processor import AudioProcessor
14
+ # from core.data_processing.video_processor import VideoProcessor
15
+ from core.data_processing.image_processor import ImageProcessor
16
+
17
+ # Import các Embedding Model (không thay đổi)
18
+ from core.embeddings.text_embedding_model import TextEmbeddingModel
19
+ from core.embeddings.image_embedding_model import ImageEmbeddingModel
20
+ from core.embeddings.audio_embedding_model import AudioEmbeddingModel
21
+
22
+ # Import VectorDBManager phiên bản Qdrant MỚI
23
+ from core.retrieval.vector_db_manager import VectorDBManager
24
+
25
class IngestionService:
    """Stateless ingestion service.

    Turns raw files into modality-specific chunks, embeds each chunk, and
    upserts the vectors into per-modality Qdrant collections. This version
    keeps no per-file state: every call re-processes whatever paths it gets.
    """

    def __init__(self, client: QdrantClient):
        """Initialize the service with a shared QdrantClient.

        Args:
            client: An already-connected Qdrant client shared across services.
        """
        logger.info("Initializing IngestionService (Stateless)...")

        self.client = client

        # Processors turn raw files into chunk dicts shaped like
        # {"content": ..., "metadata": {"type": ..., "chunk_id": ...}}.
        self.text_processor = TextProcessor()
        self.image_processor = ImageProcessor()
        self.audio_processor = AudioProcessor()
        # self.video_processor = VideoProcessor()

        # One embedding model per modality.
        self.text_embedder = TextEmbeddingModel()
        self.image_embedder = ImageEmbeddingModel()
        self.audio_embedder = AudioEmbeddingModel()

        # Text embedding size is read from the model itself to stay accurate.
        text_dim = self.text_embedder.model.get_sentence_embedding_dimension()
        self.text_db_manager = VectorDBManager(
            client=self.client,
            collection_name="text_collection",
            embedding_dim=text_dim,
        )

        # Image/audio embedding sizes are assumed to be 512 — TODO confirm
        # against the actual embedding models in use.
        image_embedding_dim = 512
        self.image_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="image_collection",
            embedding_dim=image_embedding_dim,
        )

        audio_embedding_dim = 512
        self.audio_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="audio_collection",
            embedding_dim=audio_embedding_dim,
        )

        # video_frame_embedding_dim = 512
        # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)

        logger.info("IngestionService initialized successfully.")

    def ingest_files(self, file_paths: List[str]):
        """Process a list of files, embed them, and add them to Qdrant.

        Assumes the files are already in the correct 'raw' sub-directories.
        Convenience wrapper around :meth:`ingest_files_with_progress`
        without progress reporting.
        """
        return self.ingest_files_with_progress(file_paths, None)

    def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
        """Process a list of files with optional progress tracking.

        Args:
            file_paths: Paths of the files to ingest.
            progress_callback: Optional ``callback(fraction, desc=...)`` —
                e.g. a Gradio progress handle. Fractions reported here run
                from 0.4 to 1.0; earlier stages (upload, staging) are assumed
                to be reported by the caller.
        """
        logger.info(f"Starting ingestion for {len(file_paths)} files...")

        if progress_callback:
            progress_callback(0.4, desc="Starting file processing...")

        # Phase 1: raw files -> chunk dicts.
        all_chunks_to_process = self._process_files_to_chunks(file_paths, progress_callback)

        if not all_chunks_to_process:
            logger.warning("No processable chunks were generated from the provided files.")
            return

        logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")

        if progress_callback:
            progress_callback(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")

        # Phase 2: chunks -> embeddings -> Qdrant.
        self._embed_and_store_chunks(all_chunks_to_process, progress_callback)

        if progress_callback:
            progress_callback(1.0, desc=f"✅ Successfully ingested {len(file_paths)} files with {len(all_chunks_to_process)} chunks!")

        logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")

    def _process_files_to_chunks(self, file_paths: List[str], progress_callback: Optional[Callable]) -> list:
        """Phase 1 (40% -> 70%): dispatch each file to its processor by extension.

        Unsupported extensions and per-file processing errors are logged and
        skipped so one bad file cannot abort the whole batch.
        """
        all_chunks_to_process = []
        for i, file_path in enumerate(file_paths):
            base_progress = 0.4 + (i / len(file_paths)) * 0.3  # 40% -> 70%
            file_name = os.path.basename(file_path)

            if progress_callback:
                progress_callback(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")

            # Pick the processor from the file extension.
            file_ext = os.path.splitext(file_path)[1].lower()
            try:
                if progress_callback:
                    progress_callback(base_progress + 0.01, desc=f"Reading {file_name}...")

                if file_ext in ['.txt']:
                    chunks = self.text_processor.process(file_path)
                elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
                    chunks = self.image_processor.process(file_path)
                elif file_ext in ['.wav', '.mp3']:
                    chunks = self.audio_processor.process(file_path)
                # elif file_ext in ['.mp4', '.avi', '.mov']:
                #     chunks = self.video_processor.process_video(file_path)
                else:
                    logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
                    continue

                if progress_callback:
                    progress_callback(base_progress + 0.02, desc=f"Generated {len(chunks)} chunks from {file_name}")

                all_chunks_to_process.extend(chunks)

            except Exception as e:
                logger.error(f"Error processing file {file_path}: {e}")
                continue

        return all_chunks_to_process

    def _embed_and_store_chunks(self, all_chunks_to_process: list, progress_callback: Optional[Callable]):
        """Phase 2 (70% -> 99%): embed chunks per modality and upsert in batches.

        Vectors are buffered per modality and flushed every BATCH_SIZE items;
        any remainder is flushed at the end. Per-chunk failures are logged
        and skipped.
        """
        text_embeddings_batch, text_metadatas_batch = [], []
        audio_embeddings_batch, audio_metadatas_batch = [], []
        image_embeddings_batch, image_metadatas_batch = [], []
        BATCH_SIZE = 32

        for i, chunk_data in enumerate(all_chunks_to_process):
            base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25  # 70% -> 95%

            chunk_type = chunk_data['metadata']['type']
            content = chunk_data['content']
            chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')

            try:
                if progress_callback:
                    progress_callback(base_progress, desc=f"Processing chunk {i+1}/{len(all_chunks_to_process)} ({chunk_type})")

                if chunk_type == "text":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating text embedding for chunk {i+1}")
                    text_embeddings_batch.append(self.text_embedder.get_embeddings([content])[0])
                    text_metadatas_batch.append(chunk_data)
                elif chunk_type == "audio":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating audio embedding for chunk {i+1}")
                    audio_embeddings_batch.append(self.audio_embedder.get_embeddings([content])[0])
                    audio_metadatas_batch.append(chunk_data)
                elif chunk_type == "image":
                    if progress_callback:
                        progress_callback(base_progress + 0.001, desc=f"Creating image embedding for chunk {i+1}")
                    image_embeddings_batch.append(self.image_embedder.get_embeddings([content])[0])
                    image_metadatas_batch.append(chunk_data)

                # Flush any modality batch that reached the threshold.
                if len(text_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
                    self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
                    text_embeddings_batch, text_metadatas_batch = [], []

                if len(audio_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
                    self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
                    audio_embeddings_batch, audio_metadatas_batch = [], []

                if len(image_embeddings_batch) >= BATCH_SIZE:
                    if progress_callback:
                        progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
                    self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
                    image_embeddings_batch, image_metadatas_batch = [], []

            except Exception as e:
                logger.error(f"Error ingesting chunk {chunk_id}: {e}")

        if progress_callback:
            progress_callback(0.95, desc="Saving final batches...")

        # Flush whatever is left in each batch (95% -> 99%).
        final_batches = [
            (batch_type, embeddings, metadatas, manager)
            for batch_type, embeddings, metadatas, manager in (
                ("text", text_embeddings_batch, text_metadatas_batch, self.text_db_manager),
                ("audio", audio_embeddings_batch, audio_metadatas_batch, self.audio_vector_db_manager),
                ("image", image_embeddings_batch, image_metadatas_batch, self.image_vector_db_manager),
            )
            if embeddings
        ]
        for i, (batch_type, embeddings, metadatas, manager) in enumerate(final_batches):
            current_progress = 0.95 + (i / len(final_batches)) * 0.04  # 95% -> 99%
            if progress_callback:
                progress_callback(current_progress, desc=f"Saving final {len(embeddings)} {batch_type} embeddings...")
            manager.add_vectors(embeddings, metadatas)
utils/logger.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from loguru import logger
from config.settings import settings  # Project settings (provides LOG_LEVEL)

# Configure the shared application logger.
logger.remove()  # Drop loguru's default handler so we fully control the sinks.

# File sink: timestamped, rotating, compressed log files.
logger.add(
    "logs/file_{time}.log",    # One log file per process start, named by time
    rotation="10 MB",          # Start a new file once the current one hits 10 MB
    compression="zip",         # Compress rotated files
    level=settings.LOG_LEVEL,  # Minimum level comes from application settings
    colorize=False,            # Fix: colorize is for terminal sinks; never write ANSI codes to files
    format="{time} {level} {message}",
    enqueue=True,              # Queue-based, non-blocking writes (important for threaded/async apps)
)

# Console sink: colorized output on stderr.
logger.add(
    sys.stderr,
    level=settings.LOG_LEVEL,
    colorize=True,
    format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
)

# Re-export so other modules can `from utils.logger import logger`.
__all__ = ["logger"]