import json
import numpy as np
from langchain.schema import Document
import faiss
from rank_bm25 import BM25Okapi
from data_processing import embedding_model #, index, actual_docs
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

retrieved_docs = None


def retrieve_documents_hybrid(query, top_k=5):
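    """Hybrid retrieval: merge FAISS dense search with BM25 lexical search,
    then rerank the combined results with a cross-encoder."""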
    query_dataset = find_query_dataset(query)
    
    with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)  # Contains all documents for this dataset
    
    faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)

    # FAISS Search
    _, nearest_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in nearest_indices[0]]

    # BM25 Search  
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Merge FAISS and BM25 results, dropping duplicates while preserving order
    retrieved_docs = list(dict.fromkeys(faiss_docs + bm25_docs))[:top_k]
    
    reranked_docs = rerank_documents(query, retrieved_docs)

    return reranked_docs

# Retrieval Function
def retrieve_documents(query, top_k=5):
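    """Dense-only retrieval: return the top_k FAISS neighbours as Documents."""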
    query_dataset = find_query_dataset(query)
    #index, chunk_docs = load_data_from_faiss(query)

    with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
        documents = json.load(f)  # Contains all documents for this dataset

    faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)

    _, nearest_indices = index.search(query_embedding, top_k)

    retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]

    return retrieved_docs

def remove_duplicate_documents(documents):
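    """Drop Documents whose page_content has already been seen, keeping order."""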
    unique_documents = []
    seen_documents = set()  # To keep track of seen documents
    for doc in documents:
        # Using the page_content as a unique identifier for deduplication
        doc_content = doc.page_content
        if doc_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc_content)
    return unique_documents

def find_query_dataset(query):
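    """Route the query to the dataset whose representative questions are closest
    in the question-embedding FAISS index."""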
    index = faiss.read_index("data_local/question_quantized.faiss")

    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)

    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)  
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset

def rerank_documents(query, retrieved_docs):
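    """Rerank retrieved documents with the cross-encoder and return the top 5."""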
    # Score each (query, document) pair with the cross-encoder
    scores = reranker.predict([[query, doc] for doc in retrieved_docs])
    # Sort by score, highest first (explicit key avoids comparing docs on tied scores)
    ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)]
    return ranked_docs[:5]  # Return the 5 most relevant documents
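
# Minimal usage sketch, assuming the data_local/ FAISS indexes and chunked-doc
# JSON files produced by data_processing already exist; the query is illustrative.
if __name__ == "__main__":
    sample_query = "What are the side effects of aspirin?"
    top_docs = retrieve_documents_hybrid(sample_query, top_k=5)
    for rank, doc in enumerate(top_docs, start=1):
        print(f"{rank}. {doc[:200]}")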