Spaces:
Sleeping
Sleeping
File size: 1,202 Bytes
1b04b96 2d5dee0 1b04b96 2d5dee0 1b04b96 6c9babd 2d5dee0 1b04b96 6c9babd 1b04b96 2d5dee0 1b04b96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import json
import numpy as np
from langchain.schema import Document
import faiss
from data_processing import embedding_model #, index, actual_docs
# Module-level placeholder, initialised to None. Nothing visible in this file
# writes to it: retrieve_documents() returns its results and has no `global`
# declaration, so this name stays None unless another module rebinds it.
retrieved_docs = None
# Retrieval Function
def retrieve_documents(query, top_k=5):
    """Return the stored documents that the FAISS index ranks closest to *query*.

    On every call the index is read from ``data_local/rag7_index.faiss`` and
    the document collection from ``data_local/rag7_docs.json``. The query is
    embedded with ``embedding_model.embed_documents`` and the positions
    returned by ``index.search`` are used to index the loaded collection.

    Args:
        query: Text to search for.
        top_k: Maximum number of documents to return.

    Returns:
        A list of ``Document`` objects, in the order ``index.search`` returned
        them, whose ``page_content`` is the matching entry of the JSON file.
        The list is shorter than ``top_k`` when the index cannot supply that
        many neighbours.
    """
    faiss_index_path = "data_local/rag7_index.faiss"
    documents_path = "data_local/rag7_docs.json"

    index = faiss.read_index(faiss_index_path)
    # index.search expects a 2-D float32 array: one row per query.
    query_embedding = np.array(
        embedding_model.embed_documents([query]), dtype=np.float32
    )
    _, nearest_indices = index.search(query_embedding, top_k)

    with open(documents_path, "r", encoding="utf-8") as f:
        documents = json.load(f)  # Contains all documents for this dataset

    # FAISS pads the result row with -1 when fewer than top_k neighbours
    # exist; indexing with -1 would silently return the *last* document as a
    # bogus hit, so those placeholder entries are skipped.
    retrieved_docs = [
        Document(page_content=documents[i])
        for i in nearest_indices[0]
        if i >= 0
    ]
    return retrieved_docs
def remove_duplicate_documents(documents):
    """Drop documents whose ``page_content`` has already been seen.

    The first document carrying a given ``page_content`` is kept and later
    ones with the same content are discarded; the relative order of the
    survivors is unchanged.

    Args:
        documents: Iterable of objects exposing a hashable ``page_content``.

    Returns:
        A new list holding one document per distinct ``page_content``.
    """
    # A dict keeps insertion order, and setdefault only stores the first
    # document seen for each content value.
    first_by_content = {}
    for document in documents:
        first_by_content.setdefault(document.page_content, document)
    return list(first_by_content.values())