Commit: Upload 5 files

retrieval.py (CHANGED, +10 -7)
```diff
@@ -3,27 +3,29 @@ import numpy as np
 from langchain.schema import Document
 import faiss
 
-from data_processing import embedding_model
+from data_processing import embedding_model, index, actual_docs
 
 retrieved_docs = None
 
 # Retrieval Function
 def retrieve_documents(query, top_k=5):
-    …
+    query_dataset = find_query_dataset(query)
+    #index, chunk_docs = load_data_from_faiss(query)
+
+    faiss_index_path = f"data_local/{query_dataset}__quantized.faiss"
     index = faiss.read_index(faiss_index_path)
 
     query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
 
     _, nearest_indices = index.search(query_embedding, top_k)
 
-    with open(f"data_local/…
+    with open(f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
         documents = json.load(f)  # Contains all documents for this dataset
 
     retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
 
     return retrieved_docs
 
-
 def remove_duplicate_documents(documents):
     unique_documents = []
     seen_documents = set()  # To keep track of seen documents
@@ -36,12 +38,13 @@ def remove_duplicate_documents(documents):
     return unique_documents
 
 def find_query_dataset(query):
-    index = faiss.read_index("…
+    index = faiss.read_index("data_local/question_quantized.faiss")
 
-    with open("dataset_mapping.json", "r") as f:
+    with open("data_local/dataset_mapping.json", "r") as f:
         dataset_names = json.load(f)
 
     question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
     _, nearest_index = index.search(question_embedding, 1)
     best_dataset = dataset_names[nearest_index[0][0]]
-    return best_dataset
+    return best_dataset
+
```
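Taken together, the change routes each query to a per-dataset FAISS index: find_query_dataset matches the query against a question-level index to pick the best dataset, and retrieve_documents then loads that dataset's quantized index and chunked-document JSON. A minimal usage sketch, assuming the data_local/ artifacts built by data_processing exist; the example query is hypothetical:

```python
# Hypothetical end-to-end use of the updated retrieval flow.
# Assumes data_local/ contains the *__quantized.faiss indexes,
# *_chunked_docs.json files, and dataset_mapping.json produced
# by data_processing.
from retrieval import find_query_dataset, retrieve_documents, remove_duplicate_documents

query = "What are the side effects of aspirin?"  # hypothetical query

best_dataset = find_query_dataset(query)   # routes the query to one dataset
docs = retrieve_documents(query, top_k=5)  # top-5 chunks from that dataset's index
docs = remove_duplicate_documents(docs)    # drop exact duplicates before prompting

for doc in docs:
    print(doc.page_content[:80])
```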
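The body of remove_duplicate_documents sits between the two hunks and is not shown in this diff. A plausible completion, consistent with the unique_documents list and seen_documents set visible above (not necessarily the repo's exact code):

```python
def remove_duplicate_documents(documents):
    unique_documents = []
    seen_documents = set()  # To keep track of seen documents
    for doc in documents:
        # Deduplicate on the raw chunk text; only exact matches are dropped.
        if doc.page_content not in seen_documents:
            seen_documents.add(doc.page_content)
            unique_documents.append(doc)
    return unique_documents
```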
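Note that after this change faiss.read_index runs on every call: retrieve_documents reloads the per-dataset index and find_query_dataset reloads the question index for each query. If latency matters, the loaded indexes could be memoized; a sketch using functools.lru_cache, where cached_index is a hypothetical helper, not part of this repo:

```python
import functools

import faiss

@functools.lru_cache(maxsize=8)
def cached_index(path: str):
    # Load each FAISS index from disk at most once per process.
    return faiss.read_index(path)

# Inside retrieve_documents / find_query_dataset, replace
#     index = faiss.read_index(path)
# with
#     index = cached_index(path)
```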