cb1716pics committed
Commit 1d3e5ce · verified · 1 Parent(s): c4f2afd

Upload 3 files

Files changed (2):
  app.py       +8 -4
  retrieval.py +36 -4
app.py CHANGED
@@ -1,6 +1,6 @@
 import streamlit as st
 from generator import generate_response_from_document
-from retrieval import retrieve_documents
+from retrieval import retrieve_documents_hybrid
 from evaluation import calculate_metrics
 #from data_processing import load_data_from_faiss
 import time
@@ -11,14 +11,18 @@ st.title("RAG7 - Real World RAG System")
 global retrieved_documents
 retrieved_documents = []
 
+global response
+response = ""
+
+global time_taken_for_response
+time_taken_for_response = 'N/A'
+
 # @st.cache_data
 # def load_data():
 #     load_data_from_faiss()
 
 # data_status = load_data()
 
-time_taken_for_response = 'N/A'
-
 # Question Section
 st.subheader("Hi, What do you want to know today?")
 question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
@@ -26,7 +30,7 @@ question = st.text_area("Enter your question:", placeholder="Type your question
 # Submit Button
 if st.button("Submit"):
     start_time = time.time()
-    retrieved_documents = retrieve_documents(question, 5)
+    retrieved_documents = retrieve_documents_hybrid(question, 10)
     response = generate_response_from_document(question, retrieved_documents)
     end_time = time.time()
     time_taken_for_response = end_time-start_time
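
Note on the app.py change: every click of Submit now calls retrieve_documents_hybrid, which (per the retrieval.py diff below) re-reads the FAISS index and the chunked-docs JSON from disk and rebuilds the BM25 index before searching. If that cost shows up in time_taken_for_response, Streamlit's st.cache_resource can keep those objects in memory across reruns. A minimal sketch, assuming the same data_local layout; the load_retrieval_assets helper is illustrative, not part of this commit:

import json

import faiss
import streamlit as st
from rank_bm25 import BM25Okapi

@st.cache_resource  # built once per process, reused across Streamlit reruns
def load_retrieval_assets(dataset: str):
    # Hypothetical helper: load the quantized FAISS index, the chunked
    # documents, and a prebuilt BM25 index so each Submit skips disk I/O.
    index = faiss.read_index(f"data_local/{dataset}_quantized.faiss")
    with open(f"data_local/{dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)
    bm25 = BM25Okapi([doc.split() for doc in chunked_documents])
    return index, chunked_documents, bm25
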
retrieval.py CHANGED
@@ -2,16 +2,51 @@ import json
 import numpy as np
 from langchain.schema import Document
 import faiss
-
+from rank_bm25 import BM25Okapi
 from data_processing import embedding_model #, index, actual_docs
 
 retrieved_docs = None
 
+
+def retrieve_documents_hybrid(query, top_k=5):
+    query_dataset = find_query_dataset(query)
+
+    with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
+        chunked_documents = json.load(f) # Contains all documents for this dataset
+
+    faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
+    index = faiss.read_index(faiss_index_path)
+
+    # Tokenize documents for BM25
+    tokenized_docs = [doc.split() for doc in chunked_documents]
+    bm25 = BM25Okapi(tokenized_docs)
+
+    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
+    query_embedding = query_embedding.reshape(1, -1)
+
+    # FAISS Search
+    _, nearest_indices = index.search(query_embedding, top_k)
+    faiss_docs = [chunked_documents[i] for i in nearest_indices[0]]
+
+    # BM25 Search
+    tokenized_query = query.split()
+    bm25_scores = bm25.get_scores(tokenized_query)
+    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
+    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]
+
+    # Merge FAISS + BM25 Results
+    retrieved_docs = list(set(faiss_docs + bm25_docs))[:top_k]
+
+    return retrieved_docs
+
 # Retrieval Function
 def retrieve_documents(query, top_k=5):
     query_dataset = find_query_dataset(query)
     #index, chunk_docs = load_data_from_faiss(query)
 
+    with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
+        documents = json.load(f) # Contains all documents for this dataset
+
     faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
     index = faiss.read_index(faiss_index_path)
 
@@ -19,9 +54,6 @@ def retrieve_documents(query, top_k=5):
 
     _, nearest_indices = index.search(query_embedding, top_k)
 
-    with open( f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
-        documents = json.load(f) # Contains all documents for this dataset
-
     retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
 
     return retrieved_docs
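
Two review notes on retrieve_documents_hybrid. First, it returns plain strings, while retrieve_documents wraps results as langchain Document objects, so generate_response_from_document has to accept both shapes. (Relatedly, embedding_model.embed_documents([query]) works, but LangChain embeddings also expose embed_query for single queries.) Second, the merge list(set(faiss_docs + bm25_docs))[:top_k] discards both retrievers' rankings and orders by Python's randomized string hash, so the final top_k can differ between runs on identical inputs. A rank-aware, deterministic alternative is reciprocal rank fusion; a minimal sketch, with rrf_merge as an illustrative helper that is not part of this commit:

def rrf_merge(faiss_docs, bm25_docs, top_k=5, k=60):
    # Reciprocal rank fusion: each list contributes 1 / (k + rank + 1)
    # per document, so items ranked highly by either retriever rise to
    # the top and duplicates accumulate score from both lists.
    scores = {}
    for ranked in (faiss_docs, bm25_docs):
        for rank, doc in enumerate(ranked):
            scores[doc] = scores.get(doc, 0.0) + 1.0 / (k + rank + 1)
    # Highest fused score first; deterministic for fixed inputs.
    return sorted(scores, key=scores.get, reverse=True)[:top_k]

Dropping it in would be a one-line change: retrieved_docs = rrf_merge(faiss_docs, bm25_docs, top_k).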