import json
import numpy as np
from langchain.schema import Document
import faiss
from rank_bm25 import BM25Okapi
from data_processing import embedding_model
from sentence_transformers import CrossEncoder
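
# Hybrid retrieval pipeline: BM25 (sparse) + FAISS (dense) candidate generation,
# followed by cross-encoder reranking of the merged candidate set.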

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


def retrieve_documents_hybrid(query, q_dataset, top_k=5):
    # Load the pre-chunked documents for this dataset
    with open(f"data_local/{q_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)

    # Load the quantized FAISS index built from the same chunks
    faiss_index_path = f"data_local/{q_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    # Embed the query and shape it as a (1, dim) float32 matrix for FAISS
    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)

    # Dense retrieval: FAISS nearest-neighbour search
    _, nearest_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in nearest_indices[0]]

    # Sparse retrieval: BM25 scoring over the same chunks
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Merge FAISS + BM25 results, deduplicating while preserving order
    retrieved_docs = list(dict.fromkeys(faiss_docs + bm25_docs))[:top_k]

    # Rerank the merged candidates with the cross-encoder
    reranked_docs = rerank_documents(query, retrieved_docs)

    return reranked_docs

# Retrieval Function
# def retrieve_documents(query, top_k=5):
#     query_dataset = find_query_dataset(query)
#     #index, chunk_docs = load_data_from_faiss(query)

#     with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
#         documents = json.load(f)  # Contains all documents for this dataset

#     faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
#     index = faiss.read_index(faiss_index_path)

#     query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)

#     _, nearest_indices = index.search(query_embedding, top_k)

#     retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]

#     return retrieved_docs

def remove_duplicate_documents(documents):
    unique_documents = []
    seen_documents = set()  # To keep track of seen documents
    for doc in documents:
        # Using the page_content as a unique identifier for deduplication
        doc_content = doc.page_content
        if doc_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc_content)
    return unique_documents

def find_query_dataset(query):
    # Route the query to the most relevant dataset via a FAISS index of example questions
    index = faiss.read_index("data_local/question_quantized.faiss")

    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)

    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset

def rerank_documents(query, retrieved_docs, top_k=5):
    # Score each (query, document) pair with the cross-encoder
    scores = reranker.predict([[query, doc] for doc in retrieved_docs])
    # Sort documents by descending relevance score
    ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)]
    return ranked_docs[:top_k]  # Return the top_k most relevant documents
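
# --- Usage sketch (assumption: the data_local/ artifacts referenced above already exist) ---
# Minimal example of how the routing and hybrid retrieval functions might be wired
# together; the query string below is illustrative only, not part of the pipeline.
if __name__ == "__main__":
    sample_query = "What are the side effects of aspirin?"
    dataset = find_query_dataset(sample_query)  # route the query to its dataset
    top_docs = retrieve_documents_hybrid(sample_query, dataset, top_k=5)
    for rank, doc in enumerate(top_docs, start=1):
        print(f"[{rank}] {doc[:200]}")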