Upload 4 files
- app.py +23 -25
- data_processing.py +7 -15
- evaluation.py +12 -5
- retrieval.py +34 -10
app.py
CHANGED
@@ -14,7 +14,7 @@ st.markdown(
     <style>
     .stTextArea textarea {
         background-color: white !important;
-        font-size:
+        font-size: 24px !important;
         color: black !important;
     }
     </style>
@@ -82,19 +82,19 @@ if "query_dataset" not in st.session_state:
     st.session_state.query_dataset = ''
 
 recent_questions = load_recent_questions()
+print(recent_questions)
 
-# for visualization
-
-# response_time = [q["response_time"] for q in recent_data["questions"]]
-# labels = [f"Q{i+1}" for i in range(len(response_time))]  # Labels for X-axis
-
-# fig, ax = plt.subplots()
-# ax.set_xlabel("Recent Questions")
-# ax.set_ylabel("Time Taken for Response")
-# ax.legend()
-# st.sidebar.pyplot(fig)
 if recent_questions and "questions" in recent_questions and recent_questions["questions"]:
     recent_qns = list(reversed(recent_questions["questions"]))
+
+    print(recent_qns)
+
+    # Display Recent Questions
+    st.sidebar.title("Recent Questions")
+    for q in recent_qns:  # Show latest first
+        st.sidebar.write(f"🔹 {q['question']}")
+
+    st.sidebar.markdown("---")
     st.sidebar.title("Analytics")
 
     # Extract response times and labels
@@ -119,18 +119,6 @@ if recent_questions and "questions" in recent_questions and recent_questions["questions"]:
         st.sidebar.write(f"🔹 {q['question']}")
 else:
     st.sidebar.write("No recent questions")
-# Separator
-
-# Streamlit Sidebar for Recent Questions
-
-
-# Submit Button
-# if st.button("Submit"):
-#     start_time = time.time()
-#     st.session_state.retrieved_documents = retrieve_documents_hybrid(question, 10)
-#     st.session_state.response = generate_response_from_document(question, st.session_state.retrieved_documents)
-#     end_time = time.time()
-#     st.session_state.time_taken_for_response = end_time - start_time
 
 if st.button("Submit"):
     start_time = time.time()
@@ -140,7 +128,12 @@ if st.button("Submit"):
     st.session_state.response = generate_response_from_document(question, st.session_state.retrieved_documents)
     end_time = time.time()
    st.session_state.time_taken_for_response = end_time - start_time
-
+
+    # Store in session state
+    st.session_state.recent_questions.append({
+        "question": question,
+        "response_time": st.session_state.time_taken_for_response
+    })
 
     # Display stored response
     st.subheader("Response")
@@ -164,10 +157,15 @@ col1, col2 = st.columns([1, 3])  # Creating two columns for button and metrics display
 with col1:
     if st.button("Show Metrics"):
         st.session_state.metrics = calculate_metrics(question, st.session_state.query_dataset, st.session_state.response, st.session_state.retrieved_documents, st.session_state.time_taken_for_response)
+        metrics_ = st.session_state.metrics
 else:
     metrics_ = {}
 
 with col2:
     #st.text_area("Metrics:", value=metrics, height=100, disabled=True)
-
+    if len(metrics_) > 0:
+        st.json(metrics_)
+
+    save_recent_question(question, st.session_state.metrics)
+
 
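One caveat on the new session-state write in the Submit handler: `st.session_state.recent_questions.append(...)` assumes the `recent_questions` key already exists, and Streamlit raises an `AttributeError` when an unset key is accessed as an attribute. Unless the key is initialized in a part of app.py not shown in this diff, a minimal guard sketch would be:

# Hypothetical guard, placed with the other session-state initializations
# near the top of app.py; avoids AttributeError on the first run of a session.
if "recent_questions" not in st.session_state:
    st.session_state.recent_questions = []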
data_processing.py
CHANGED
@@ -21,7 +21,6 @@ embedding_model = HuggingFaceEmbeddings(
 reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
 query_dataset_data = {}
 
-# File path for storing recently asked questions and metrics
 RECENT_QUESTIONS_FILE = "data_local/recent_questions.json"
 
 # Ensure the file exists and initialize if empty
@@ -36,10 +35,7 @@ chunk_docs = []
 documents = []
 query_dataset_data = {}
 
-#
-os.makedirs("data_local", exist_ok=True)
-
-# Initialize a text splitter
+# Text splitter
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=1024,
     chunk_overlap=100
@@ -55,14 +51,12 @@ def create_faiss_index(dataset):
 
     for split in ragbench_dataset.keys():
         for row in ragbench_dataset[split]:
-            # Ensure document is a string before appending
             doc = row["documents"]
             if isinstance(doc, list):
-                # If doc is a list, join its elements into a single string
                 doc = " ".join(doc)
-            documents.append(doc)  #
-
-
+            documents.append(doc)  #
+
+    # Chunking
     chunked_documents = chunk_documents(documents)
 
     # Save documents in JSON (metadata storage)
@@ -76,7 +70,6 @@ def create_faiss_index(dataset):
     # Convert embeddings to a NumPy array
     embeddings_np = np.array(embeddings, dtype=np.float32)
 
-
     # Save FAISS index
     index = faiss.IndexHNSWFlat(embeddings_np.shape[1], 32)  # 32 is the graph size
     index.add(embeddings_np)
@@ -141,17 +134,16 @@ def load_recent_questions():
     if os.path.exists(RECENT_QUESTIONS_FILE):
         with open(RECENT_QUESTIONS_FILE, "r") as file:
             return json.load(file)
-            return {"questions": []}
+    return {"questions": []}
 
-def save_recent_question(question,
+def save_recent_question(question, metrics):
     data = load_recent_questions()
 
-    #data["questions"] = [q for q in data["questions"] if q["question"] != question]
     if "question" in data["questions"] and question not in data["questions"]["question"]:
         # Append new question & metrics
         data["questions"].append({
             "question": question,
-            "
+            "metrics": metrics
         })
 
     # Keep only the last 5 questions
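A caveat on the updated `save_recent_question`: `data["questions"]` is a list of dicts, so the check `"question" in data["questions"]` compares a string against whole dict entries and never succeeds, which means the append branch cannot run. A sketch of a membership test matching the JSON shape this module writes (the shape is inferred from `load_recent_questions`):

# Sketch: dedupe by question text, assuming data["questions"] is a list of
# {"question": ..., "metrics": ...} entries.
existing = {entry["question"] for entry in data["questions"]}
if question not in existing:
    data["questions"].append({"question": question, "metrics": metrics})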
evaluation.py
CHANGED
@@ -101,13 +101,13 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
 
     # Predicted metrics
     predicted_metrics = {
-        "RAG_model_response": response,
-        "ground_truth": ground_truth_answer,
         "context_relevance": context_relevance(question, docs),
         "context_utilization": context_utilization(response, docs),
         "completeness": completeness(response, ground_truth_answer),
         "adherence": adherence(response, docs),
-        "response_time": time_taken
+        "response_time": time_taken,
+        "ground_truth": ground_truth_answer,
+        "RAG_model_response": response
     }
     return predicted_metrics
 
@@ -115,7 +115,8 @@ def retrieve_ground_truths(question, dataset):
     for split_name, instances in dataset.items():
         print(f"Processing {split_name} split")
         for instance in instances:
-            if instance['question'] == question:
+            #if instance['question'] == question:
+            if is_similar(instance['question'], question):
                 instance_id = instance['id']
                 instance_response = instance['response']
                 # ground_truth_metrics = {
@@ -128,4 +129,10 @@ def retrieve_ground_truths(question, dataset):
                 print(f"ID: {instance_id}, Response: {instance_response}")
                 return instance_response  # Return ground truth response immediately
 
-            return None
+    return None
+
+def is_similar(question1, question2, threshold=0.85):
+    vectorizer = TfidfVectorizer()
+    vectors = vectorizer.fit_transform([question1, question2])
+    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
+    return similarity >= threshold
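The new `is_similar` helper calls `TfidfVectorizer` and `cosine_similarity`; unless they are already imported in a part of evaluation.py not shown here, the standard scikit-learn imports would be:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Note that fitting TF-IDF on just the two questions leaves the IDF weights with little to discriminate on, so the 0.85 threshold behaves close to a weighted bag-of-words overlap cutoff.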
retrieval.py
CHANGED
@@ -5,11 +5,17 @@ import faiss
 from rank_bm25 import BM25Okapi
 from data_processing import embedding_model
 from sentence_transformers import CrossEncoder
+from nltk.tokenize import word_tokenize
+import string
 
 reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
 
 retrieved_docs = None
 
+# Tokenize the documents and remove punctuation
+def preprocess(doc):
+    return [word.lower() for word in word_tokenize(doc) if word not in string.punctuation]
+
 def retrieve_documents_hybrid(query, q_dataset, top_k=5):
     with open( f"data_local/{q_dataset}_chunked_docs.json", "r") as f:
         chunked_documents = json.load(f)  # Contains all documents for this dataset
@@ -18,29 +24,48 @@ def retrieve_documents_hybrid(query, q_dataset, top_k=5):
     index = faiss.read_index(faiss_index_path)
 
     # Tokenize documents for BM25
-    tokenized_docs = [doc
+    tokenized_docs = [preprocess(doc) for doc in chunked_documents]
     bm25 = BM25Okapi(tokenized_docs)
 
     query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
     query_embedding = query_embedding.reshape(1, -1)
 
     # FAISS Search
-
-    faiss_docs = [chunked_documents[i] for i in
+    faiss_distances, faiss_indices = index.search(query_embedding, top_k)
+    faiss_docs = [chunked_documents[i] for i in faiss_indices[0]]
 
     # BM25 Search
-    tokenized_query = query
+    tokenized_query = preprocess(query)
     bm25_scores = bm25.get_scores(tokenized_query)
     bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
     bm25_docs = [chunked_documents[i] for i in bm25_top_indices]
 
-    #
-
-
-
+    # Combine FAISS + BM25 scores and retrieve docs
+    combined_results = set(bm25_top_indices).union(set(faiss_indices[0]))
+
+    combined_scores = rerank_docs_bm25faiss_scores(combined_results, bm25_scores, faiss_distances, faiss_indices)
+    reranked_docs = [chunked_documents[result[0]] for result in combined_scores[:top_k]]
+
+    # Merge FAISS + BM25 Results and re-rank
+    #retrieved_docs = list(set(faiss_docs + bm25_docs))[:top_k]
+    #reranked_docs = rerank_documents(query, retrieved_docs)
 
     return reranked_docs
 
+def rerank_docs_bm25faiss_scores(combined_results_, bm25_scores_, faiss_distances_, faiss_indices_):
+    final_results = []
+    for idx in combined_results_:
+        # Combine BM25 score and FAISS score for ranking (this could be more sophisticated)
+        bm25_score = bm25_scores_[idx]
+        faiss_score = 1 / (1 + faiss_distances_[0][np.where(faiss_indices_[0] == idx)])  # Inverse distance for relevance
+        final_results.append((idx, bm25_score, faiss_score))
+
+    # Sort final results by combined score (you can adjust the ranking strategy here)
+    final_results.sort(key=lambda x: (x[1] + x[2]), reverse=True)
+
+    return final_results
+
+
 # Retrieval Function
 # def retrieve_documents(query, top_k=5):
 #     query_dataset = find_query_dataset(query)
@@ -62,9 +87,8 @@ def retrieve_documents_hybrid(query, q_dataset, top_k=5):
 
 def remove_duplicate_documents(documents):
     unique_documents = []
-    seen_documents = set()
+    seen_documents = set()
     for doc in documents:
-        # Using the page_content as a unique identifier for deduplication
         doc_content = doc.page_content
         if doc_content not in seen_documents:
             unique_documents.append(doc)
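Two caveats on the new hybrid scoring. First, `word_tokenize` requires the NLTK `punkt` tokenizer data (`nltk.download("punkt")`) to be present at runtime. Second, `combined_results` can contain indices that only BM25 returned; for those, `np.where(faiss_indices_[0] == idx)` matches nothing, `faiss_score` ends up an empty array, and the sort key `x[1] + x[2]` is no longer a scalar. A defensive sketch (the 0.0 fallback for missing FAISS hits is an assumption, not part of this commit):

import numpy as np

def combined_score(idx, bm25_scores, faiss_distances, faiss_indices):
    # BM25 score is defined for every corpus index.
    bm25_score = float(bm25_scores[idx])
    # FAISS contributes only if it also retrieved this index; otherwise
    # fall back to 0.0 instead of indexing with an empty match.
    match = np.where(faiss_indices[0] == idx)[0]
    faiss_score = 1.0 / (1.0 + float(faiss_distances[0][match[0]])) if match.size else 0.0
    return bm25_score + faiss_score

Since raw BM25 scores and inverse L2 distances sit on different scales, min-max normalizing each before summing would make the combination less lopsided.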