fix metadata
Browse files- app/gemini_client.py +3 -3
- app/law_document_chunker.py +25 -9
- app/main.py +1 -0
app/gemini_client.py
CHANGED
@@ -103,7 +103,7 @@ class GeminiClient:
|
|
103 |
logger.error(f"[GEMINI] Error counting tokens: {e}")
|
104 |
return 0
|
105 |
|
106 |
-
def create_embedding(self, text: str, model: Optional[str] = None) -> list:
|
107 |
last_error = None
|
108 |
max_retries = 3
|
109 |
|
@@ -117,13 +117,13 @@ class GeminiClient:
|
|
117 |
if not use_model:
|
118 |
raise ValueError("No model specified for embedding")
|
119 |
|
120 |
-
logger.info(f"[GEMINI][EMBEDDING] Using model={use_model} (requested={model}, default={default_model})")
|
121 |
|
122 |
configure(api_key=key)
|
123 |
response = embed_content(
|
124 |
model=use_model,
|
125 |
content=text,
|
126 |
-
task_type=
|
127 |
)
|
128 |
|
129 |
self.limit_manager.log_request(key, use_model, success=True)
|
|
|
103 |
logger.error(f"[GEMINI] Error counting tokens: {e}")
|
104 |
return 0
|
105 |
|
106 |
+
def create_embedding(self, text: str, model: Optional[str] = None, task_type: str = "retrieval_query") -> list:
|
107 |
last_error = None
|
108 |
max_retries = 3
|
109 |
|
|
|
117 |
if not use_model:
|
118 |
raise ValueError("No model specified for embedding")
|
119 |
|
120 |
+
logger.info(f"[GEMINI][EMBEDDING] Using model={use_model} (requested={model}, default={default_model}), task_type={task_type}")
|
121 |
|
122 |
configure(api_key=key)
|
123 |
response = embed_content(
|
124 |
model=use_model,
|
125 |
content=text,
|
126 |
+
task_type=task_type
|
127 |
)
|
128 |
|
129 |
self.limit_manager.log_request(key, use_model, success=True)
|
app/law_document_chunker.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import re
|
2 |
import os
|
3 |
import uuid
|
4 |
-
from typing import List, Dict, Optional, Tuple
|
5 |
from dataclasses import dataclass
|
6 |
from loguru import logger
|
7 |
from .supabase_db import SupabaseClient
|
@@ -30,6 +30,7 @@ class LawDocumentChunker:
|
|
30 |
settings = get_settings()
|
31 |
self.supabase_client = SupabaseClient(settings.supabase_url, settings.supabase_key)
|
32 |
self.embedding_client = EmbeddingClient()
|
|
|
33 |
|
34 |
# Regex patterns cho các cấp độ cấu trúc - SỬA LẠI ĐỂ CHÍNH XÁC HƠN
|
35 |
# Đảm bảo mỗi pattern có đúng số group
|
@@ -417,17 +418,14 @@ class LawDocumentChunker:
|
|
417 |
|
418 |
for i, chunk in enumerate(chunks, 1):
|
419 |
try:
|
420 |
-
#
|
421 |
-
|
422 |
-
|
423 |
-
# # Tạo context_summary bằng LLM
|
424 |
-
# context_summary = await self._create_context_summary_with_llm(chunk.content, chunk)
|
425 |
|
426 |
# Chuẩn bị data cho Supabase
|
427 |
chunk_dict = {
|
428 |
'id': chunk.id,
|
429 |
'content': chunk.content,
|
430 |
-
'embedding': [0.0] * 768, #
|
431 |
'vanbanid': chunk.vanbanid,
|
432 |
'cha': chunk.cha,
|
433 |
'document_title': chunk.document_title,
|
@@ -435,7 +433,7 @@ class LawDocumentChunker:
|
|
435 |
'article_title': chunk.article_title,
|
436 |
'clause_number': chunk.clause_number,
|
437 |
'sub_clause_letter': chunk.sub_clause_letter,
|
438 |
-
'context_summary': f"Structure:
|
439 |
}
|
440 |
|
441 |
# Lưu ngay lập tức vào Supabase
|
@@ -508,4 +506,22 @@ class LawDocumentChunker:
|
|
508 |
|
509 |
except Exception as e:
|
510 |
logger.error(f"[CHUNKER] Error processing document {document_id}: {e}") ##
|
511 |
-
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
import os
|
3 |
import uuid
|
4 |
+
from typing import List, Dict, Optional, Tuple, Any
|
5 |
from dataclasses import dataclass
|
6 |
from loguru import logger
|
7 |
from .supabase_db import SupabaseClient
|
|
|
30 |
settings = get_settings()
|
31 |
self.supabase_client = SupabaseClient(settings.supabase_url, settings.supabase_key)
|
32 |
self.embedding_client = EmbeddingClient()
|
33 |
+
self.llm_client: Optional[Any] = None
|
34 |
|
35 |
# Regex patterns cho các cấp độ cấu trúc - SỬA LẠI ĐỂ CHÍNH XÁC HƠN
|
36 |
# Đảm bảo mỗi pattern có đúng số group
|
|
|
418 |
|
419 |
for i, chunk in enumerate(chunks, 1):
|
420 |
try:
|
421 |
+
# Tạo embedding
|
422 |
+
embedding = await self.embedding_client.create_embedding(chunk.content, task_type="retrieval_document")
|
|
|
|
|
|
|
423 |
|
424 |
# Chuẩn bị data cho Supabase
|
425 |
chunk_dict = {
|
426 |
'id': chunk.id,
|
427 |
'content': chunk.content,
|
428 |
+
'embedding': embedding if embedding is not None else [0.0] * 768, # Sử dụng embedding thực tế nếu có
|
429 |
'vanbanid': chunk.vanbanid,
|
430 |
'cha': chunk.cha,
|
431 |
'document_title': chunk.document_title,
|
|
|
433 |
'article_title': chunk.article_title,
|
434 |
'clause_number': chunk.clause_number,
|
435 |
'sub_clause_letter': chunk.sub_clause_letter,
|
436 |
+
'context_summary': f"Structure: {chunk.context_summary}|Semantic: {chunk.content}"
|
437 |
}
|
438 |
|
439 |
# Lưu ngay lập tức vào Supabase
|
|
|
506 |
|
507 |
except Exception as e:
|
508 |
logger.error(f"[CHUNKER] Error processing document {document_id}: {e}") ##
|
509 |
+
return False
|
510 |
+
|
511 |
+
async def _create_semantic_summary_with_llm(self, chunk_content: str) -> str:
|
512 |
+
"""
|
513 |
+
Sinh semantic summary ngắn gọn, súc tích cho chunk bằng LLM.
|
514 |
+
"""
|
515 |
+
if not hasattr(self, "llm_client") or self.llm_client is None:
|
516 |
+
logger.warning("[CHUNKER] llm_client chưa được gán, bỏ qua semantic summary.")
|
517 |
+
return ""
|
518 |
+
prompt = (
|
519 |
+
"Tóm tắt thật ngắn gọn, súc tích nội dung luật sau (1-2 câu, không lặp lại tiêu đề, không giải thích):\n"
|
520 |
+
f"{chunk_content.strip()}"
|
521 |
+
)
|
522 |
+
try:
|
523 |
+
summary = await self.llm_client.generate_text(prompt)
|
524 |
+
return summary.strip() if summary else ""
|
525 |
+
except Exception as e:
|
526 |
+
logger.error(f"[CHUNKER] Lỗi khi sinh semantic summary bằng LLM: {e}")
|
527 |
+
return ""
|
app/main.py
CHANGED
@@ -77,6 +77,7 @@ reranker = Reranker()
|
|
77 |
|
78 |
# Khởi tạo LawDocumentChunker
|
79 |
law_chunker = LawDocumentChunker()
|
|
|
80 |
|
81 |
logger.info("[STARTUP] Mount health router...")
|
82 |
app.include_router(health_router)
|
|
|
77 |
|
78 |
# Khởi tạo LawDocumentChunker
|
79 |
law_chunker = LawDocumentChunker()
|
80 |
+
law_chunker.llm_client = llm_client
|
81 |
|
82 |
logger.info("[STARTUP] Mount health router...")
|
83 |
app.include_router(health_router)
|