import json

import faiss
import numpy as np
from langchain.schema import Document
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

from data_processing import embedding_model

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


def retrieve_documents_hybrid(query, q_dataset, top_k=5):
    """Hybrid retrieval: merge dense (FAISS) and sparse (BM25) candidates, then rerank."""
    with open(f"data_local/{q_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)  # All chunked documents for this dataset

    faiss_index_path = f"data_local/{q_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)  # FAISS expects a 2-D array

    # FAISS (dense) search
    _, nearest_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in nearest_indices[0]]

    # BM25 (sparse) search
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Merge FAISS + BM25 results, deduplicating while preserving order
    # (a plain set() would reorder the candidates non-deterministically)
    retrieved_docs = list(dict.fromkeys(faiss_docs + bm25_docs))

    # Rerank the full merged pool, then keep the top_k best
    reranked_docs = rerank_documents(query, retrieved_docs, top_k=top_k)
    return reranked_docs


# Legacy dense-only retrieval, kept for reference:
# def retrieve_documents(query, top_k=5):
#     query_dataset = find_query_dataset(query)
#     with open(f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
#         documents = json.load(f)  # All chunked documents for this dataset
#     index = faiss.read_index(f"data_local/{query_dataset}_quantized.faiss")
#     query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
#     _, nearest_indices = index.search(query_embedding, top_k)
#     return [Document(page_content=documents[i]) for i in nearest_indices[0]]


def remove_duplicate_documents(documents):
    """Deduplicate Document objects by page_content, preserving order."""
    unique_documents = []
    seen_documents = set()  # Page contents seen so far
    for doc in documents:
        # Use page_content as the unique identifier for deduplication
        if doc.page_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc.page_content)
    return unique_documents


def find_query_dataset(query):
    """Route a query to the dataset whose indexed questions are nearest in embedding space."""
    index = faiss.read_index("data_local/question_quantized.faiss")
    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)

    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset


def rerank_documents(query, retrieved_docs, top_k=5):
    """Score (query, document) pairs with the cross-encoder and return the top_k documents."""
    scores = reranker.predict([(query, doc) for doc in retrieved_docs])
    # Sort by score only; sorting raw (score, doc) tuples would fall back to
    # comparing the documents themselves on tied scores
    ranked = sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked][:top_k]  # Top_k most relevant documents
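
# Minimal usage sketch (an assumption, not part of the original module): it
# presumes the data_local/ artifacts referenced above (the *_chunked_docs.json
# files, the *_quantized.faiss indexes, and dataset_mapping.json) have already
# been built by the data-processing pipeline. The query string and dataset
# routing below are illustrative only.
if __name__ == "__main__":
    query = "Who wrote the novel the 2012 film adaptation was based on?"
    dataset = find_query_dataset(query)  # Route the query to its nearest dataset
    docs = retrieve_documents_hybrid(query, dataset, top_k=5)
    for rank, doc in enumerate(docs, start=1):
        print(f"[{rank}] {doc[:200]}")  # Each result is a raw text chunk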