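"""Document retrieval utilities: dataset routing, FAISS dense search, BM25 lexical
search, and cross-encoder reranking over the chunked documents stored in data_local/."""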
import json
import numpy as np
from langchain.schema import Document
import faiss
from rank_bm25 import BM25Okapi
from data_processing import embedding_model #, index, actual_docs
from sentence_transformers import CrossEncoder
# Cross-encoder used to rerank the merged FAISS + BM25 candidates
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
def retrieve_documents_hybrid(query, top_k=5):
    """Hybrid retrieval: merge FAISS dense search with BM25 lexical search, then rerank."""
    query_dataset = find_query_dataset(query)
    with open(f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)  # Contains all chunked documents for this dataset
    faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    # Embed the query for dense search
    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)

    # FAISS (dense) search
    _, nearest_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in nearest_indices[0]]

    # BM25 (lexical) search
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Merge FAISS + BM25 results; dict.fromkeys deduplicates while preserving order,
    # so the truncation to top_k is deterministic (unlike set(), whose order is arbitrary)
    retrieved_docs = list(dict.fromkeys(faiss_docs + bm25_docs))[:top_k]
    reranked_docs = rerank_documents(query, retrieved_docs)
    return reranked_docs
# Retrieval Function
def retrieve_documents(query, top_k=5):
    query_dataset = find_query_dataset(query)
    # index, chunk_docs = load_data_from_faiss(query)
    with open(f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
        documents = json.load(f)  # Contains all documents for this dataset
    faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)
    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_indices = index.search(query_embedding, top_k)
    retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
    return retrieved_docs
def remove_duplicate_documents(documents):
    unique_documents = []
    seen_documents = set()  # To keep track of seen documents
    for doc in documents:
        # Using the page_content as a unique identifier for deduplication
        doc_content = doc.page_content
        if doc_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc_content)
    return unique_documents
def find_query_dataset(query):
    index = faiss.read_index("data_local/question_quantized.faiss")
    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)
    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset
def rerank_documents(query, retrieved_docs):
    # Score each (query, document) pair with the cross-encoder and sort by score, descending
    scores = reranker.predict([[query, doc] for doc in retrieved_docs])
    ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)]
    return ranked_docs[:5]  # Return the top 5 most relevant documents
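

# Minimal usage sketch, not part of the retrieval module itself. It assumes the
# data_local/ chunk files and FAISS indexes referenced above have already been
# built; the query string is a placeholder, not a real example from the datasets.
if __name__ == "__main__":
    sample_query = "example question about the indexed corpus"  # hypothetical query
    top_docs = retrieve_documents_hybrid(sample_query, top_k=5)
    for rank, doc in enumerate(top_docs, start=1):
        print(f"[{rank}] {doc[:200]}")  # print a short preview of each reranked chunk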