VietCat commited on
Commit
34991da
·
1 Parent(s): 70d2f99

update remote

Browse files
Files changed (2) hide show
  1. app/law_document_chunker.py +1 -1
  2. app/reranker.py +107 -34
app/law_document_chunker.py CHANGED
@@ -186,7 +186,7 @@ class LawDocumentChunker:
186
  if parent.article_number and not metadata.article_number:
187
  metadata.article_number = parent.article_number
188
  if parent.article_title and not metadata.article_title:
189
- metadata.article_title = parent.article_title
190
  if parent.clause_number and not metadata.clause_number:
191
  metadata.clause_number = parent.clause_number
192
  if parent.sub_clause_letter and not metadata.sub_clause_letter:
 
186
  if parent.article_number and not metadata.article_number:
187
  metadata.article_number = parent.article_number
188
  if parent.article_title and not metadata.article_title:
189
+ metadata.article_title = parent.article_title #
190
  if parent.clause_number and not metadata.clause_number:
191
  metadata.clause_number = parent.clause_number
192
  if parent.sub_clause_letter and not metadata.sub_clause_letter:
app/reranker.py CHANGED
@@ -4,6 +4,7 @@ from .gemini_client import GeminiClient
4
  from loguru import logger
5
  import asyncio
6
  import random
 
7
  from .constants import BATCH_STATUS_MESSAGES
8
 
9
  class Reranker:
@@ -20,17 +21,26 @@ class Reranker:
20
  else:
21
  raise NotImplementedError(f"Rerank provider {self.provider} not supported yet.")
22
  self.facebook_client = facebook_client
 
 
 
 
 
 
 
 
 
23
 
24
  async def _score_doc(self, query: str, doc: Dict) -> Dict:
25
  """
26
  Score một document với query.
27
  """
28
  content = (doc.get('tieude', '') or '') + ' ' + (doc.get('noidung', '') or '')
 
29
  prompt = (
30
- f"Đoạn luật: {content}\n"
31
- f"Câu hỏi: {query}\n"
32
- "Hãy đánh giá mức độ liên quan giữa đoạn luật và câu hỏi trên thang điểm 0-10. "
33
- "Chỉ trả về một số duy nhất."
34
  )
35
 
36
  try:
@@ -51,53 +61,116 @@ class Reranker:
51
  doc['rerank_score'] = 0
52
  return doc
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  async def rerank(self, query: str, docs: List[Dict], top_k: int = 5) -> List[Dict]:
55
  """
56
  Rerank docs theo độ liên quan với query, trả về top_k docs.
57
- Sử dụng concurrency để process nhiều docs cùng lúc.
58
  """
59
  logger.info(f"[RERANK] Start rerank for query: {query} | docs: {len(docs)} | top_k: {top_k}")
60
 
61
  if not docs:
62
  return []
63
 
64
- # Rerank toàn bộ docs, không giới hạn 10 docs
65
- docs_to_rerank = docs
66
- logger.info(f"[RERANK] Will rerank {len(docs_to_rerank)} docs (no limit)")
 
 
 
67
 
68
- # Process docs với concurrency
69
- batch_size = 5 # Process 5 docs cùng lúc
70
- scored = []
 
71
 
72
- for i in range(0, len(docs_to_rerank), batch_size):
73
- batch = docs_to_rerank[i:i + batch_size]
74
- logger.info(f"[RERANK] Processing batch {i//batch_size + 1}: {len(batch)} docs")
75
-
76
- # Tạo tasks cho batch hiện tại
77
- tasks = [self._score_doc(query, doc) for doc in batch]
78
-
79
- # Chạy batch concurrently
80
- batch_results = await asyncio.gather(*tasks, return_exceptions=True)
81
-
82
- # Xử lý kết quả
83
- for result in batch_results:
84
- if isinstance(result, Exception):
85
- logger.error(f"[RERANK] Batch processing error: {result}")
86
- continue
87
- scored.append(result)
88
-
89
- logger.info(f"[RERANK] Completed batch {i//batch_size + 1}, processed {len(scored)} docs so far")
90
- # Send Facebook message after each batch
91
- if self.facebook_client:
92
  try:
93
- message = random.choice(BATCH_STATUS_MESSAGES)
94
- await self.facebook_client.send_message(message=f"... {message} ...")
95
  except Exception as e:
96
- logger.error(f"[RERANK][FACEBOOK] Error sending batch message: {e}")
 
 
 
 
 
 
 
 
 
 
97
 
98
  # Sort theo score và trả về top_k
99
  scored = sorted(scored, key=lambda x: x['rerank_score'], reverse=True)
100
  result = scored[:top_k]
101
 
 
 
 
 
 
 
 
 
102
  logger.info(f"[RERANK] Top reranked docs: {result}")
103
  return result
 
4
  from loguru import logger
5
  import asyncio
6
  import random
7
+ import hashlib
8
  from .constants import BATCH_STATUS_MESSAGES
9
 
10
  class Reranker:
 
21
  else:
22
  raise NotImplementedError(f"Rerank provider {self.provider} not supported yet.")
23
  self.facebook_client = facebook_client
24
+ # Cache cho kết quả reranking
25
+ self._rerank_cache = {}
26
+
27
+ def _get_cache_key(self, query: str, docs: List[Dict]) -> str:
28
+ """Tạo cache key từ query và docs."""
29
+ # Tạo hash từ query và doc IDs
30
+ doc_ids = [str(doc.get('id', '')) for doc in docs[:15]] # Chỉ cache top 15 docs
31
+ cache_content = query + "|".join(doc_ids)
32
+ return hashlib.md5(cache_content.encode()).hexdigest()
33
 
34
  async def _score_doc(self, query: str, doc: Dict) -> Dict:
35
  """
36
  Score một document với query.
37
  """
38
  content = (doc.get('tieude', '') or '') + ' ' + (doc.get('noidung', '') or '')
39
+ # Tối ưu prompt ngắn gọn hơn
40
  prompt = (
41
+ f"Luật: {content[:500]}\n" # Giới hạn content length
42
+ f"Hỏi: {query}\n"
43
+ "Đánh giá mức độ liên quan (0-10). Chỉ trả về số."
 
44
  )
45
 
46
  try:
 
61
  doc['rerank_score'] = 0
62
  return doc
63
 
64
+ async def _batch_score_docs(self, query: str, docs: List[Dict]) -> List[Dict]:
65
+ """
66
+ Score nhiều documents cùng lúc bằng một prompt duy nhất.
67
+ """
68
+ if not docs:
69
+ return []
70
+
71
+ # Tạo prompt batch cho tất cả documents
72
+ docs_content = []
73
+ for i, doc in enumerate(docs):
74
+ content = (doc.get('tieude', '') or '') + ' ' + (doc.get('noidung', '') or '')
75
+ docs_content.append(f"{i+1}. {content[:300]}") # Giới hạn length
76
+
77
+ batch_prompt = (
78
+ f"Câu hỏi: {query}\n\n"
79
+ f"Các đoạn luật:\n" + "\n".join(docs_content) + "\n\n"
80
+ f"Đánh giá mức độ liên quan của từng đoạn (0-10). Trả về dạng: 1.8,2.5,3.0,..."
81
+ )
82
+
83
+ try:
84
+ if self.provider == 'gemini':
85
+ loop = asyncio.get_event_loop()
86
+ logger.info(f"[RERANK] Sending batch prompt to Gemini")
87
+ response = await loop.run_in_executor(None, self.client.generate_text, batch_prompt)
88
+ logger.info(f"[RERANK] Got batch scores from Gemini: {response}")
89
+
90
+ # Parse scores từ response
91
+ scores_text = str(response).strip()
92
+ scores = []
93
+ for score_str in scores_text.split(','):
94
+ try:
95
+ score = float(score_str.strip().split('.')[0])
96
+ scores.append(score)
97
+ except:
98
+ scores.append(0)
99
+
100
+ # Gán scores cho documents
101
+ for i, doc in enumerate(docs):
102
+ doc['rerank_score'] = scores[i] if i < len(scores) else 0
103
+
104
+ return docs
105
+
106
+ else:
107
+ raise NotImplementedError(f"Rerank provider {self.provider} not supported yet in batch method.")
108
+
109
+ except Exception as e:
110
+ logger.error(f"[RERANK] Lỗi khi batch score: {e}")
111
+ # Fallback về individual scoring
112
+ for doc in docs:
113
+ doc['rerank_score'] = 0
114
+ return docs
115
+
116
  async def rerank(self, query: str, docs: List[Dict], top_k: int = 5) -> List[Dict]:
117
  """
118
  Rerank docs theo độ liên quan với query, trả về top_k docs.
119
+ Sử dụng batch processing để tối ưu hiệu suất.
120
  """
121
  logger.info(f"[RERANK] Start rerank for query: {query} | docs: {len(docs)} | top_k: {top_k}")
122
 
123
  if not docs:
124
  return []
125
 
126
+ # Kiểm tra cache trước
127
+ cache_key = self._get_cache_key(query, docs)
128
+ if cache_key in self._rerank_cache:
129
+ logger.info(f"[RERANK] Cache hit for query, returning cached result")
130
+ cached_result = self._rerank_cache[cache_key][:top_k]
131
+ return cached_result
132
 
133
+ # Giới hạn số lượng docs để rerank - chỉ rerank top 15 docs có similarity cao nhất
134
+ max_docs_to_rerank = 15
135
+ docs_to_rerank = docs[:max_docs_to_rerank]
136
+ logger.info(f"[RERANK] Will rerank {len(docs_to_rerank)} docs (limited to top {max_docs_to_rerank})")
137
 
138
+ # Sử dụng batch processing thay vì individual scoring
139
+ try:
140
+ scored = await self._batch_score_docs(query, docs_to_rerank)
141
+ logger.info(f"[RERANK] Batch processing completed, scored {len(scored)} docs")
142
+ except Exception as e:
143
+ logger.error(f"[RERANK] Batch processing failed, falling back to individual scoring: {e}")
144
+ # Fallback về individual scoring nếu batch processing thất bại
145
+ scored = []
146
+ for doc in docs_to_rerank:
 
 
 
 
 
 
 
 
 
 
 
147
  try:
148
+ scored_doc = await self._score_doc(query, doc)
149
+ scored.append(scored_doc)
150
  except Exception as e:
151
+ logger.error(f"[RERANK] Error scoring individual doc: {e}")
152
+ doc['rerank_score'] = 0
153
+ scored.append(doc)
154
+
155
+ # Gửi Facebook message chỉ một lần sau khi hoàn thành
156
+ if self.facebook_client:
157
+ try:
158
+ message = random.choice(BATCH_STATUS_MESSAGES)
159
+ await self.facebook_client.send_message(message=f"... {message} ...")
160
+ except Exception as e:
161
+ logger.error(f"[RERANK][FACEBOOK] Error sending batch message: {e}")
162
 
163
  # Sort theo score và trả về top_k
164
  scored = sorted(scored, key=lambda x: x['rerank_score'], reverse=True)
165
  result = scored[:top_k]
166
 
167
+ # Cache kết quả
168
+ self._rerank_cache[cache_key] = scored
169
+ # Giới hạn cache size để tránh memory leak
170
+ if len(self._rerank_cache) > 100:
171
+ # Xóa cache cũ nhất
172
+ oldest_key = next(iter(self._rerank_cache))
173
+ del self._rerank_cache[oldest_key]
174
+
175
  logger.info(f"[RERANK] Top reranked docs: {result}")
176
  return result