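"""Hybrid retrieval utilities: dense FAISS search combined with sparse BM25
search, followed by cross-encoder reranking of the merged candidate chunks."""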
import json
import numpy as np
from langchain.schema import Document
import faiss
from rank_bm25 import BM25Okapi
from data_processing import embedding_model
from sentence_transformers import CrossEncoder
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
def retrieve_documents_hybrid(query, q_dataset, top_k=5):
    """Retrieve top_k chunks for `query` by merging FAISS (dense) and BM25
    (sparse) results, then rerank the merged set with the cross-encoder."""
    # Load the chunked documents and the FAISS index for this dataset
    with open(f"data_local/{q_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)  # All chunks for this dataset
    faiss_index_path = f"data_local/{q_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    # Embed the query for dense retrieval
    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)

    # FAISS search (dense)
    _, nearest_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in nearest_indices[0]]

    # BM25 search (sparse)
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Merge FAISS + BM25 results, drop duplicates, and rerank
    retrieved_docs = list(set(faiss_docs + bm25_docs))[:top_k]
    reranked_docs = rerank_documents(query, retrieved_docs)
    return reranked_docs
# Earlier FAISS-only retrieval function (superseded by retrieve_documents_hybrid, kept commented out for reference)
# def retrieve_documents(query, top_k=5):
# query_dataset = find_query_dataset(query)
# #index, chunk_docs = load_data_from_faiss(query)
# with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
# documents = json.load(f) # Contains all documents for this dataset
# faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
# index = faiss.read_index(faiss_index_path)
# query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
# _, nearest_indices = index.search(query_embedding, top_k)
# retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
# return retrieved_docs
def remove_duplicate_documents(documents):
    """Deduplicate Document objects, keeping the first occurrence of each page_content."""
    unique_documents = []
    seen_documents = set()  # Track page_content values already seen
    for doc in documents:
        # Use page_content as the unique identifier for deduplication
        doc_content = doc.page_content
        if doc_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc_content)
    return unique_documents
def find_query_dataset(query):
    """Route a query to its most likely source dataset via the question-embedding index."""
    index = faiss.read_index("data_local/question_quantized.faiss")
    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)
    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset
def rerank_documents(query, retrieved_docs):
    """Score each (query, document) pair with the cross-encoder and return the
    documents ordered from most to least relevant."""
    scores = reranker.predict([[query, doc] for doc in retrieved_docs])
    # Sort by score only (key=...) so tied scores never fall back to comparing documents
    ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)]
    return ranked_docs[:5]  # Return the top 5 most relevant documents
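
# Example usage sketch: route a query to its dataset, then run hybrid retrieval.
# Assumes the data_local/ index and mapping files referenced above exist; the
# query string below is a hypothetical placeholder, not taken from the datasets.
if __name__ == "__main__":
    sample_query = "What are the symptoms of diabetes?"  # hypothetical example query
    dataset = find_query_dataset(sample_query)
    top_docs = retrieve_documents_hybrid(sample_query, dataset, top_k=5)
    for rank, doc in enumerate(top_docs, start=1):
        print(f"[{rank}] {doc[:200]}")  # each doc is a plain text chunk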