cb1716pics committed on
Commit
c14a20a
·
verified ·
1 Parent(s): 99afa50

Upload 5 files

Browse files
Files changed (1) hide show
  1. retrieval.py +10 -7
retrieval.py CHANGED
@@ -3,27 +3,29 @@ import numpy as np
3
  from langchain.schema import Document
4
  import faiss
5
 
6
- from data_processing import embedding_model #, index, actual_docs
7
 
8
  retrieved_docs = None
9
 
10
  # Retrieval Function
11
  def retrieve_documents(query, top_k=5):
12
- faiss_index_path = f"data_local/rag7_index.faiss"
 
 
 
13
  index = faiss.read_index(faiss_index_path)
14
 
15
  query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
16
 
17
  _, nearest_indices = index.search(query_embedding, top_k)
18
 
19
- with open(f"data_local/rag7_docs.json", "r") as f:
20
  documents = json.load(f) # Contains all documents for this dataset
21
 
22
  retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
23
 
24
  return retrieved_docs
25
 
26
-
27
  def remove_duplicate_documents(documents):
28
  unique_documents = []
29
  seen_documents = set() # To keep track of seen documents
@@ -36,12 +38,13 @@ def remove_duplicate_documents(documents):
36
  return unique_documents
37
 
38
  def find_query_dataset(query):
39
- index = faiss.read_index("question_index.faiss")
40
 
41
- with open("dataset_mapping.json", "r") as f:
42
  dataset_names = json.load(f)
43
 
44
  question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
45
  _, nearest_index = index.search(question_embedding, 1)
46
  best_dataset = dataset_names[nearest_index[0][0]]
47
- return best_dataset
 
 
3
  from langchain.schema import Document
4
  import faiss
5
 
6
+ from data_processing import embedding_model , index, actual_docs
7
 
8
  retrieved_docs = None
9
 
10
  # Retrieval Function
11
  def retrieve_documents(query, top_k=5):
12
+ query_dataset = find_query_dataset(query)
13
+ #index, chunk_docs = load_data_from_faiss(query)
14
+
15
+ faiss_index_path = f"data_local/{query_dataset}__quantized.faiss"
16
  index = faiss.read_index(faiss_index_path)
17
 
18
  query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
19
 
20
  _, nearest_indices = index.search(query_embedding, top_k)
21
 
22
+ with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
23
  documents = json.load(f) # Contains all documents for this dataset
24
 
25
  retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
26
 
27
  return retrieved_docs
28
 
 
29
  def remove_duplicate_documents(documents):
30
  unique_documents = []
31
  seen_documents = set() # To keep track of seen documents
 
38
  return unique_documents
39
 
40
  def find_query_dataset(query):
41
+ index = faiss.read_index("data_local/question_quantized.faiss")
42
 
43
+ with open("data_local/dataset_mapping.json", "r") as f:
44
  dataset_names = json.load(f)
45
 
46
  question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
47
  _, nearest_index = index.search(question_embedding, 1)
48
  best_dataset = dataset_names[nearest_index[0][0]]
49
+ return best_dataset
50
+