fisherman611 committed on
Commit
e62c1c2
·
verified ·
1 Parent(s): 73fec0e

Update main/chatbot.py

Browse files
Files changed (1) hide show
  1. main/chatbot.py +29 -5
main/chatbot.py CHANGED
@@ -144,16 +144,20 @@ class VietnameseLegalRAG:
144
  query, top_k=vector_top_k
145
  )
146
 
147
- # Combine and deduplicate results
148
  all_docs = {}
149
 
150
- # Add BM25 results
151
  for doc in bm25_results:
152
  doc_id = doc.get('id', '')
153
  if doc_id:
154
- all_docs[doc_id] = {**doc, 'retrieval_method': 'bm25'}
 
 
 
 
155
 
156
- # Add vector results
157
  for doc in vector_results:
158
  doc_id = doc.get('id', '')
159
  if doc_id:
@@ -161,11 +165,23 @@ class VietnameseLegalRAG:
161
  # Combine scores if document found by both methods
162
  all_docs[doc_id]['retrieval_method'] = 'hybrid'
163
  all_docs[doc_id]['vector_score'] = doc.get('score', 0)
 
 
 
 
 
164
  else:
165
- all_docs[doc_id] = {**doc, 'retrieval_method': 'vector'}
 
 
 
 
166
 
167
  retrieved_docs = list(all_docs.values())
168
 
 
 
 
169
  elif self.vector_store:
170
  # Vector search only
171
  retrieved_docs = self.vector_store.search_similar_documents(query, top_k=vector_top_k)
@@ -224,6 +240,14 @@ class VietnameseLegalRAG:
224
 
225
  else:
226
  print("No documents found with sufficient similarity scores")
 
 
 
 
 
 
 
 
227
  return []
228
  else:
229
  # No documents retrieved
 
144
  query, top_k=vector_top_k
145
  )
146
 
147
+ # Combine and deduplicate results with better scoring
148
  all_docs = {}
149
 
150
+ # Add BM25 results with proper scoring
151
  for doc in bm25_results:
152
  doc_id = doc.get('id', '')
153
  if doc_id:
154
+ all_docs[doc_id] = {
155
+ **doc,
156
+ 'retrieval_method': 'bm25',
157
+ 'bm25_score': doc.get('score', 0)
158
+ }
159
 
160
+ # Add vector results with proper scoring
161
  for doc in vector_results:
162
  doc_id = doc.get('id', '')
163
  if doc_id:
 
165
  # Combine scores if document found by both methods
166
  all_docs[doc_id]['retrieval_method'] = 'hybrid'
167
  all_docs[doc_id]['vector_score'] = doc.get('score', 0)
168
+ # Use higher score as main score for now
169
+ all_docs[doc_id]['score'] = max(
170
+ all_docs[doc_id].get('bm25_score', 0),
171
+ doc.get('score', 0)
172
+ )
173
  else:
174
+ all_docs[doc_id] = {
175
+ **doc,
176
+ 'retrieval_method': 'vector',
177
+ 'vector_score': doc.get('score', 0)
178
+ }
179
 
180
  retrieved_docs = list(all_docs.values())
181
 
182
+ # Sort by score for better ranking
183
+ retrieved_docs.sort(key=lambda x: x.get('score', 0), reverse=True)
184
+
185
  elif self.vector_store:
186
  # Vector search only
187
  retrieved_docs = self.vector_store.search_similar_documents(query, top_k=vector_top_k)
 
240
 
241
  else:
242
  print("No documents found with sufficient similarity scores")
243
+ # Fallback: return best available documents anyway (with lower threshold)
244
+ if retrieved_docs:
245
+ print(f"Fallback: returning top {min(5, len(retrieved_docs))} documents with best scores")
246
+ # Sort by score and return best ones
247
+ retrieved_docs.sort(key=lambda x: x.get('score', 0), reverse=True)
248
+ fallback_docs = retrieved_docs[:min(5, len(retrieved_docs))]
249
+ print([(fallback_doc['id'], fallback_doc['score']) for fallback_doc in fallback_docs])
250
+ return fallback_docs
251
  return []
252
  else:
253
  # No documents retrieved