Update main/chatbot.py
main/chatbot.py  CHANGED  (+29 -5)
@@ -144,16 +144,20 @@ class VietnameseLegalRAG:
                 query, top_k=vector_top_k
             )
 
-            # Combine and deduplicate results
+            # Combine and deduplicate results with better scoring
             all_docs = {}
 
-            # Add BM25 results
+            # Add BM25 results with proper scoring
             for doc in bm25_results:
                 doc_id = doc.get('id', '')
                 if doc_id:
-                    all_docs[doc_id] = {
+                    all_docs[doc_id] = {
+                        **doc,
+                        'retrieval_method': 'bm25',
+                        'bm25_score': doc.get('score', 0)
+                    }
 
-            # Add vector results
+            # Add vector results with proper scoring
             for doc in vector_results:
                 doc_id = doc.get('id', '')
                 if doc_id:
@@ -161,11 +165,23 @@ class VietnameseLegalRAG:
                         # Combine scores if document found by both methods
                         all_docs[doc_id]['retrieval_method'] = 'hybrid'
                         all_docs[doc_id]['vector_score'] = doc.get('score', 0)
+                        # Use higher score as main score for now
+                        all_docs[doc_id]['score'] = max(
+                            all_docs[doc_id].get('bm25_score', 0),
+                            doc.get('score', 0)
+                        )
                     else:
-                        all_docs[doc_id] = {
+                        all_docs[doc_id] = {
+                            **doc,
+                            'retrieval_method': 'vector',
+                            'vector_score': doc.get('score', 0)
+                        }
 
             retrieved_docs = list(all_docs.values())
 
+            # Sort by score for better ranking
+            retrieved_docs.sort(key=lambda x: x.get('score', 0), reverse=True)
+
         elif self.vector_store:
             # Vector search only
             retrieved_docs = self.vector_store.search_similar_documents(query, top_k=vector_top_k)
@@ -224,6 +240,14 @@ class VietnameseLegalRAG:
 
             else:
                 print("No documents found with sufficient similarity scores")
+                # Fallback: return best available documents anyway (with lower threshold)
+                if retrieved_docs:
+                    print(f"Fallback: returning top {min(5, len(retrieved_docs))} documents with best scores")
+                    # Sort by score and return best ones
+                    retrieved_docs.sort(key=lambda x: x.get('score', 0), reverse=True)
+                    fallback_docs = retrieved_docs[:min(5, len(retrieved_docs))]
+                    print([(fallback_doc['id'], fallback_doc['score']) for fallback_doc in fallback_docs])
+                    return fallback_docs
                 return []
         else:
             # No documents retrieved
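A note on the scoring change: the new code takes max(bm25_score, vector_score) as the combined score, but raw BM25 scores and vector similarity scores are generally on different numeric scales, so whichever happens to be larger will dominate both the hybrid ranking and the fallback's top-5 cut. Below is a minimal sketch of one alternative, min-max normalising each result list before merging. It is illustrative only and not part of chatbot.py; normalise and merge_results are hypothetical helpers, and the result dicts are only assumed to carry the same 'id' and 'score' keys used in the diff above.

    # Illustrative sketch, not code from chatbot.py: normalise each result list
    # to [0, 1] before the max()-based combination so the two retrievers are
    # compared on the same scale.

    def normalise(results, score_key='score'):
        """Min-max rescale the score field of a result list into [0, 1]."""
        scores = [doc.get(score_key, 0) for doc in results]
        if not scores:
            return []
        lo, hi = min(scores), max(scores)
        span = (hi - lo) or 1.0  # all scores equal: avoid division by zero
        return [{**doc, score_key: (doc.get(score_key, 0) - lo) / span} for doc in results]

    def merge_results(bm25_results, vector_results):
        """Deduplicate by 'id' and keep the higher normalised score, mirroring the diff."""
        merged = {}
        for doc in normalise(bm25_results):
            doc_id = doc.get('id', '')
            if doc_id:
                merged[doc_id] = {**doc, 'retrieval_method': 'bm25', 'bm25_score': doc['score']}
        for doc in normalise(vector_results):
            doc_id = doc.get('id', '')
            if not doc_id:
                continue
            if doc_id in merged:
                entry = merged[doc_id]
                entry['retrieval_method'] = 'hybrid'
                entry['vector_score'] = doc['score']
                entry['score'] = max(entry['bm25_score'], doc['score'])
            else:
                merged[doc_id] = {**doc, 'retrieval_method': 'vector', 'vector_score': doc['score']}
        return sorted(merged.values(), key=lambda d: d.get('score', 0), reverse=True)

    if __name__ == '__main__':
        # Hypothetical inputs showing the shape assumed above.
        bm25 = [{'id': 'dieu_1', 'score': 12.4}, {'id': 'dieu_2', 'score': 3.1}]
        vect = [{'id': 'dieu_2', 'score': 0.83}, {'id': 'dieu_7', 'score': 0.55}]
        for doc in merge_results(bm25, vect):
            print(doc['id'], doc['retrieval_method'], round(doc['score'], 3))

Because the fallback branch re-sorts by the same 'score' field before taking the top five documents, normalising the scores first would also make that cut compare like with like.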