VietCat committed on
Commit
6723e05
·
1 Parent(s): 329eb9f

fix metadata

Browse files
app/gemini_client.py CHANGED
@@ -103,7 +103,7 @@ class GeminiClient:
103
  logger.error(f"[GEMINI] Error counting tokens: {e}")
104
  return 0
105
 
106
- def create_embedding(self, text: str, model: Optional[str] = None) -> list:
107
  last_error = None
108
  max_retries = 3
109
 
@@ -117,13 +117,13 @@ class GeminiClient:
117
  if not use_model:
118
  raise ValueError("No model specified for embedding")
119
 
120
- logger.info(f"[GEMINI][EMBEDDING] Using model={use_model} (requested={model}, default={default_model})")
121
 
122
  configure(api_key=key)
123
  response = embed_content(
124
  model=use_model,
125
  content=text,
126
- task_type="retrieval_query"
127
  )
128
 
129
  self.limit_manager.log_request(key, use_model, success=True)
 
103
  logger.error(f"[GEMINI] Error counting tokens: {e}")
104
  return 0
105
 
106
+ def create_embedding(self, text: str, model: Optional[str] = None, task_type: str = "retrieval_query") -> list:
107
  last_error = None
108
  max_retries = 3
109
 
 
117
  if not use_model:
118
  raise ValueError("No model specified for embedding")
119
 
120
+ logger.info(f"[GEMINI][EMBEDDING] Using model={use_model} (requested={model}, default={default_model}), task_type={task_type}")
121
 
122
  configure(api_key=key)
123
  response = embed_content(
124
  model=use_model,
125
  content=text,
126
+ task_type=task_type
127
  )
128
 
129
  self.limit_manager.log_request(key, use_model, success=True)
app/law_document_chunker.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
  import os
3
  import uuid
4
- from typing import List, Dict, Optional, Tuple
5
  from dataclasses import dataclass
6
  from loguru import logger
7
  from .supabase_db import SupabaseClient
@@ -30,6 +30,7 @@ class LawDocumentChunker:
30
  settings = get_settings()
31
  self.supabase_client = SupabaseClient(settings.supabase_url, settings.supabase_key)
32
  self.embedding_client = EmbeddingClient()
 
33
 
34
  # Regex patterns cho các cấp độ cấu trúc - SỬA LẠI ĐỂ CHÍNH XÁC HƠN
35
  # Đảm bảo mỗi pattern có đúng số group
@@ -417,17 +418,14 @@ class LawDocumentChunker:
417
 
418
  for i, chunk in enumerate(chunks, 1):
419
  try:
420
- # # Tạo embedding
421
- # embedding = await self.embedding_client.create_embedding(chunk.content)
422
-
423
- # # Tạo context_summary bằng LLM
424
- # context_summary = await self._create_context_summary_with_llm(chunk.content, chunk)
425
 
426
  # Chuẩn bị data cho Supabase
427
  chunk_dict = {
428
  'id': chunk.id,
429
  'content': chunk.content,
430
- 'embedding': [0.0] * 768, # Dummy embedding vector 768 chiều cho Supabase
431
  'vanbanid': chunk.vanbanid,
432
  'cha': chunk.cha,
433
  'document_title': chunk.document_title,
@@ -435,7 +433,7 @@ class LawDocumentChunker:
435
  'article_title': chunk.article_title,
436
  'clause_number': chunk.clause_number,
437
  'sub_clause_letter': chunk.sub_clause_letter,
438
- 'context_summary': f"Structure: Test | Semantic: Test" # Test context_summary
439
  }
440
 
441
  # Lưu ngay lập tức vào Supabase
@@ -508,4 +506,22 @@ class LawDocumentChunker:
508
 
509
  except Exception as e:
510
  logger.error(f"[CHUNKER] Error processing document {document_id}: {e}") ##
511
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  import os
3
  import uuid
4
+ from typing import List, Dict, Optional, Tuple, Any
5
  from dataclasses import dataclass
6
  from loguru import logger
7
  from .supabase_db import SupabaseClient
 
30
  settings = get_settings()
31
  self.supabase_client = SupabaseClient(settings.supabase_url, settings.supabase_key)
32
  self.embedding_client = EmbeddingClient()
33
+ self.llm_client: Optional[Any] = None
34
 
35
  # Regex patterns cho các cấp độ cấu trúc - SỬA LẠI ĐỂ CHÍNH XÁC HƠN
36
  # Đảm bảo mỗi pattern có đúng số group
 
418
 
419
  for i, chunk in enumerate(chunks, 1):
420
  try:
421
+ # Tạo embedding
422
+ embedding = await self.embedding_client.create_embedding(chunk.content, task_type="retrieval_document")
 
 
 
423
 
424
  # Chuẩn bị data cho Supabase
425
  chunk_dict = {
426
  'id': chunk.id,
427
  'content': chunk.content,
428
+ 'embedding': embedding if embedding is not None else [0.0] * 768, # Sử dụng embedding thực tế nếu có, nếu không thì dùng vector 0 (768 chiều)
429
  'vanbanid': chunk.vanbanid,
430
  'cha': chunk.cha,
431
  'document_title': chunk.document_title,
 
433
  'article_title': chunk.article_title,
434
  'clause_number': chunk.clause_number,
435
  'sub_clause_letter': chunk.sub_clause_letter,
436
+ 'context_summary': f"Structure: {chunk.context_summary}|Semantic: {chunk.content}"
437
  }
438
 
439
  # Lưu ngay lập tức vào Supabase
 
506
 
507
  except Exception as e:
508
  logger.error(f"[CHUNKER] Error processing document {document_id}: {e}") ##
509
+ return False
510
+
511
+ async def _create_semantic_summary_with_llm(self, chunk_content: str) -> str:
512
+ """
513
+ Sinh semantic summary ngắn gọn, súc tích cho chunk bằng LLM.
514
+ """
515
+ if not hasattr(self, "llm_client") or self.llm_client is None:
516
+ logger.warning("[CHUNKER] llm_client chưa được gán, bỏ qua semantic summary.")
517
+ return ""
518
+ prompt = (
519
+ "Tóm tắt thật ngắn gọn, súc tích nội dung luật sau (1-2 câu, không lặp lại tiêu đề, không giải thích):\n"
520
+ f"{chunk_content.strip()}"
521
+ )
522
+ try:
523
+ summary = await self.llm_client.generate_text(prompt)
524
+ return summary.strip() if summary else ""
525
+ except Exception as e:
526
+ logger.error(f"[CHUNKER] Lỗi khi sinh semantic summary bằng LLM: {e}")
527
+ return ""
app/main.py CHANGED
@@ -77,6 +77,7 @@ reranker = Reranker()
77
 
78
  # Khởi tạo LawDocumentChunker
79
  law_chunker = LawDocumentChunker()
 
80
 
81
  logger.info("[STARTUP] Mount health router...")
82
  app.include_router(health_router)
 
77
 
78
  # Khởi tạo LawDocumentChunker
79
  law_chunker = LawDocumentChunker()
80
+ law_chunker.llm_client = llm_client
81
 
82
  logger.info("[STARTUP] Mount health router...")
83
  app.include_router(health_router)