cb1716pics committed on
Commit
ce3af46
·
verified ·
1 Parent(s): ced5431

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +118 -170
  2. data_processing.py +29 -1
app.py CHANGED
@@ -1,196 +1,144 @@
1
- # import streamlit as st
2
- # from generator import generate_response_from_document
3
- # from retrieval import retrieve_documents_hybrid
4
- # from evaluation import calculate_metrics
5
- # #from data_processing import load_data_from_faiss
6
- # import time
7
-
8
- # # Page Title
9
- # st.title("RAG7 - Real World RAG System")
10
-
11
- # # global retrieved_documents
12
- # # retrieved_documents = []
13
-
14
- # # global response
15
- # # response = ""
16
-
17
- # # global time_taken_for_response
18
- # # time_taken_for_response = 'N/A'
19
-
20
- # # @st.cache_data
21
- # # def load_data():
22
- # # load_data_from_faiss()
23
-
24
- # # data_status = load_data()
25
-
26
- # # Question Section
27
- # st.subheader("Hi, What do you want to know today?")
28
- # question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
29
-
30
- # # # Submit Button
31
- # # if st.button("Submit"):
32
- # # start_time = time.time()
33
- # # retrieved_documents = retrieve_documents_hybrid(question, 10)
34
- # # response = generate_response_from_document(question, retrieved_documents)
35
- # # end_time = time.time()
36
- # # time_taken_for_response = end_time-start_time
37
- # # else:
38
- # # response = ""
39
-
40
- # # # Response Section
41
- # # st.subheader("Response")
42
- # # st.text_area("Generated Response:", value=response, height=150, disabled=True)
43
-
44
- # # # Metrics Section
45
- # # st.subheader("Metrics")
46
-
47
- # # col1, col2 = st.columns([1, 3]) # Creating two columns for button and metrics display
48
-
49
- # # with col1:
50
- # # if st.button("Calculate Metrics"):
51
- # # metrics = calculate_metrics(question, response, retrieved_documents, time_taken_for_response)
52
- # # else:
53
- # # metrics = ""
54
-
55
- # # with col2:
56
- # # st.text_area("Metrics:", value=metrics, height=100, disabled=True)
57
-
58
- # if "retrieved_documents" not in st.session_state:
59
- # st.session_state.retrieved_documents = []
60
- # if "response" not in st.session_state:
61
- # st.session_state.response = ""
62
- # if "time_taken_for_response" not in st.session_state:
63
- # st.session_state.time_taken_for_response = "N/A"
64
 
65
  # # Submit Button
66
  # if st.button("Submit"):
67
  # start_time = time.time()
68
- # st.session_state.retrieved_documents = retrieve_documents_hybrid(question, 10)
69
- # st.session_state.response = generate_response_from_document(question, st.session_state.retrieved_documents)
70
  # end_time = time.time()
71
- # st.session_state.time_taken_for_response = end_time - start_time
 
 
72
 
73
- # # Display stored response
74
  # st.subheader("Response")
75
- # st.text_area("Generated Response:", value=st.session_state.response, height=150, disabled=True)
 
 
 
76
 
77
  # col1, col2 = st.columns([1, 3]) # Creating two columns for button and metrics display
78
 
79
- # # Calculate Metrics Button
80
  # with col1:
81
  # if st.button("Calculate Metrics"):
82
- # metrics = calculate_metrics(question, st.session_state.response, st.session_state.retrieved_documents, st.session_state.time_taken_for_response)
83
  # else:
84
- # metrics = {}
85
 
86
  # with col2:
87
- # #st.text_area("Metrics:", value=metrics, height=100, disabled=True)
88
- # st.json(metrics)
89
 
90
- import streamlit as st
91
- import plotly.express as px
92
- from datasets import load_dataset, Dataset, DatasetDict
93
- from generator import generate_response_from_document
94
- from retrieval import retrieve_documents_hybrid
95
- from evaluation import calculate_metrics
96
- import time
 
97
 
98
- # Hugging Face Dataset Details
99
- HF_DATASET_REPO = "cb1716pics/23RAG7_recent_questions" # Hugging Face repo
100
-
101
- # Load Dataset from Hugging Face
102
- @st.cache_resource
103
- def load_hf_dataset():
104
- try:
105
- return load_dataset(HF_DATASET_REPO)
106
- except:
107
- return DatasetDict({"recent": Dataset.from_dict({"question": [], "response": [], "metrics": []})})
108
-
109
- dataset = load_hf_dataset()
110
-
111
- # Function to Save Data to Hugging Face Dataset
112
- def save_to_hf_dataset(question, response, metrics):
113
- global dataset
114
- new_data = {
115
- "question": [question],
116
- "response": [response],
117
- "metrics": [metrics]
118
- }
119
-
120
- # Convert existing dataset to a list and append new data
121
- dataset_dict = dataset["recent"].to_dict()
122
- for key in new_data.keys():
123
- dataset_dict[key] = dataset_dict.get(key, []) + new_data[key]
124
-
125
- # Keep only the last 10 entries
126
- for key in dataset_dict.keys():
127
- dataset_dict[key] = dataset_dict[key][-10:]
128
-
129
- # Convert back to dataset and push to Hugging Face
130
- dataset["recent"] = Dataset.from_dict(dataset_dict)
131
- dataset.push_to_hub(HF_DATASET_REPO)
132
-
133
- # Streamlit UI
134
- st.title("🔍 RAG7 - Real World RAG System")
135
-
136
- # Sidebar - Recent Questions
137
- st.sidebar.header("📌 Recent Questions")
138
- if len(dataset["recent"]) > 0:
139
- for q in dataset["recent"]["question"][-10:]:
140
- st.sidebar.write(f"🔹 {q}")
141
-
142
- # Sidebar - Analytics with Graph
143
- st.sidebar.header("📊 Analytics Overview")
144
- if len(dataset["recent"]) > 0:
145
- # Extract recent metrics for visualization
146
- metrics_data = dataset["recent"]["metrics"][-10:]
147
- metrics_keys = ["context_relevance", "context_utilization", "completeness", "adherence"]
148
-
149
- # Prepare a dictionary for graphing
150
- graph_data = {key: [m[key] for m in metrics_data] for key in metrics_keys}
151
- graph_data["Question #"] = list(range(1, len(metrics_data) + 1))
152
-
153
- # Convert to DataFrame for Plotly
154
- import pandas as pd
155
- df = pd.DataFrame(graph_data)
156
-
157
- # Plot Metrics Over Time
158
- fig = px.line(df, x="Question #", y=metrics_keys,
159
- labels={"value": "Score", "variable": "Metric"},
160
- title="📈 Model Performance Over Recent Questions")
161
- st.sidebar.plotly_chart(fig, use_container_width=True)
162
-
163
- # Evaluate Button
164
- if st.sidebar.button("⚡ Evaluate RAG Model"):
165
- st.sidebar.success("✅ Model Evaluation Triggered!")
166
-
167
- # Main Section - User Input
168
- st.subheader("💬 Ask a Question")
169
- question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
170
 
171
  # Submit Button
172
- if st.button("🚀 Submit"):
 
 
 
 
 
 
 
173
  start_time = time.time()
174
- retrieved_documents = retrieve_documents_hybrid(question, 10)
175
- response = generate_response_from_document(question, retrieved_documents)
176
  end_time = time.time()
177
- time_taken_for_response = end_time - start_time
178
 
179
- # Calculate Metrics
180
- metrics = calculate_metrics(question, response, retrieved_documents, time_taken_for_response)
181
 
182
- # Save Data
183
- save_to_hf_dataset(question, response, metrics)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
- # Display Response
186
- st.subheader("💡 Response")
187
- st.text_area("Generated Response:", value=response, height=150, disabled=True)
188
 
189
- # Display Metrics with Bar Chart
190
- st.subheader("📊 Metrics")
191
- st.json(metrics)
 
 
 
192
 
193
- # Plot Bar Chart for Metrics
194
- metric_df = pd.DataFrame({"Metric": list(metrics.keys()), "Score": list(metrics.values())})
195
- fig2 = px.bar(metric_df, x="Metric", y="Score", title="📊 Current Query Metrics")
196
- st.plotly_chart(fig2, use_container_width=True)
 
1
+ import streamlit as st
2
+ from generator import generate_response_from_document
3
+ from retrieval import retrieve_documents_hybrid
4
+ from evaluation import calculate_metrics
5
+ from data_processing import load_recent_questions, save_recent_question
6
+ import time
7
+
8
+ # Page Title
9
+ st.title("RAG7 - Real World RAG System")
10
+
11
+ # global retrieved_documents
12
+ # retrieved_documents = []
13
+
14
+ # global response
15
+ # response = ""
16
+
17
+ # global time_taken_for_response
18
+ # time_taken_for_response = 'N/A'
19
+
20
+ # @st.cache_data
21
+ # def load_data():
22
+ # load_data_from_faiss()
23
+
24
+ # data_status = load_data()
25
+
26
+ # Question Section
27
+ st.subheader("Hi, What do you want to know today?")
28
+ question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # # Submit Button
31
  # if st.button("Submit"):
32
  # start_time = time.time()
33
+ # retrieved_documents = retrieve_documents_hybrid(question, 10)
34
+ # response = generate_response_from_document(question, retrieved_documents)
35
  # end_time = time.time()
36
+ # time_taken_for_response = end_time-start_time
37
+ # else:
38
+ # response = ""
39
 
40
+ # # Response Section
41
  # st.subheader("Response")
42
+ # st.text_area("Generated Response:", value=response, height=150, disabled=True)
43
+
44
+ # # Metrics Section
45
+ # st.subheader("Metrics")
46
 
47
  # col1, col2 = st.columns([1, 3]) # Creating two columns for button and metrics display
48
 
 
49
  # with col1:
50
  # if st.button("Calculate Metrics"):
51
+ # metrics = calculate_metrics(question, response, retrieved_documents, time_taken_for_response)
52
  # else:
53
+ # metrics = ""
54
 
55
  # with col2:
56
+ # st.text_area("Metrics:", value=metrics, height=100, disabled=True)
 
57
 
58
# Seed the session-state keys used below; keys that are already set keep
# their current value.
_SESSION_DEFAULTS = {
    "retrieved_documents": [],
    "response": "",
    "time_taken_for_response": "N/A",
    "metrics": {},
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
66
 
67
import matplotlib.pyplot as plt


def _metric_series(entries, key):
    """Return ``entry["metrics"][key]`` for every entry, as a list of floats.

    An entry whose metrics are missing, lack ``key``, or hold a non-numeric
    value contributes NaN instead of raising, so a single malformed stored
    record cannot break the sidebar on every run.
    """
    series = []
    for entry in entries:
        metrics = entry.get("metrics")
        value = metrics.get(key) if isinstance(metrics, dict) else None
        try:
            series.append(float(value))
        except (TypeError, ValueError):
            series.append(float("nan"))
    return series


# Sidebar: recently asked questions, latest first
st.sidebar.title("Recent Questions")

recent_data = load_recent_questions()
recent_entries = recent_data.get("questions", [])
for entry in reversed(recent_entries):
    st.sidebar.write(f"🔹 {entry.get('question', '')}")

st.sidebar.markdown("---")  # Separator

# Sidebar: line chart of stored metrics, one point per recent question
st.sidebar.title("Analytics")

context_relevance = _metric_series(recent_entries, "context_relevance")
response_time = _metric_series(recent_entries, "response_time")
labels = [f"Q{i}" for i in range(1, len(recent_entries) + 1)]  # X-axis labels

fig, ax = plt.subplots()
ax.plot(labels, context_relevance, marker="o", label="Context Relevance")
ax.plot(labels, response_time, marker="s", label="Response Time (sec)")
ax.set_xlabel("Recent Questions")
ax.set_ylabel("Scores")
ax.legend()
st.sidebar.pyplot(fig)
# Figures created through pyplot stay registered until closed explicitly;
# close this one once it has been rendered so they do not accumulate.
plt.close(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  # Submit Button
96
+ # if st.button("Submit"):
97
+ # start_time = time.time()
98
+ # st.session_state.retrieved_documents = retrieve_documents_hybrid(question, 10)
99
+ # st.session_state.response = generate_response_from_document(question, st.session_state.retrieved_documents)
100
+ # end_time = time.time()
101
+ # st.session_state.time_taken_for_response = end_time - start_time
102
+
103
# On Submit: retrieve documents, generate a response, time the round trip,
# compute metrics, and record the question in the recent-questions store.
if st.button("Submit"):
    if not question.strip():
        # A blank question would still run retrieval/generation and be saved
        # into the recent-questions store; ask for input instead.
        st.warning("Please enter a question before submitting.")
    else:
        start_time = time.time()
        st.session_state.retrieved_documents = retrieve_documents_hybrid(question, 10)
        st.session_state.response = generate_response_from_document(
            question, st.session_state.retrieved_documents
        )
        end_time = time.time()
        st.session_state.time_taken_for_response = end_time - start_time

        # Calculate metrics
        st.session_state.metrics = calculate_metrics(
            question,
            st.session_state.response,
            st.session_state.retrieved_documents,
            st.session_state.time_taken_for_response,
        )

        # Save question & metrics
        save_recent_question(question, st.session_state.metrics)
115
+
116
+
117
# Show the response currently held in session state
st.subheader("Response")
st.text_area(
    "Generated Response:",
    value=st.session_state.response,
    height=150,
    disabled=True,
)

# Narrow column for the button, wide column for the metrics output
button_col, metrics_col = st.columns([1, 3])

with button_col:
    show_metrics = st.button("Show Metrics")

with metrics_col:
    # Render the stored metrics only when the button was pressed; otherwise
    # render an empty JSON object.
    st.json(st.session_state.metrics if show_metrics else {})
 
data_processing.py CHANGED
@@ -20,6 +20,14 @@ embedding_model = HuggingFaceEmbeddings(
20
 
21
  reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
22
 
 
 
 
 
 
 
 
 
23
  all_documents = []
24
  ragbench = {}
25
  index = None
@@ -106,7 +114,6 @@ def load_chunks(query_dataset):
106
  def load_data_from_faiss(query_dataset):
107
  load_faiss(query_dataset)
108
  load_chunks(query_dataset)
109
- #return index_, chunks_
110
 
111
  def rerank_documents(query, retrieved_docs):
112
  doc_texts = [doc for doc in retrieved_docs]
@@ -114,3 +121,24 @@ def rerank_documents(query, retrieved_docs):
114
  ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), reverse=True)]
115
  return ranked_docs[:5] # Return top 5 most relevant
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
22
 
23
# File path for storing recently asked questions and metrics
RECENT_QUESTIONS_FILE = "data_local/recent_questions.json"

# Ensure the store exists before anything reads it. The parent directory is
# created first: open(..., "w") creates a missing file but not a missing
# directory, so without this a checkout lacking data_local/ fails here with
# FileNotFoundError.
os.makedirs(os.path.dirname(RECENT_QUESTIONS_FILE), exist_ok=True)
if not os.path.exists(RECENT_QUESTIONS_FILE):
    with open(RECENT_QUESTIONS_FILE, "w", encoding="utf-8") as file:
        json.dump({"questions": []}, file, indent=4)
30
+
31
  all_documents = []
32
  ragbench = {}
33
  index = None
 
114
  def load_data_from_faiss(query_dataset):
115
  load_faiss(query_dataset)
116
  load_chunks(query_dataset)
 
117
 
118
  def rerank_documents(query, retrieved_docs):
119
  doc_texts = [doc for doc in retrieved_docs]
 
121
  ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), reverse=True)]
122
  return ranked_docs[:5] # Return top 5 most relevant
123
 
124
def load_recent_questions(path=None):
    """Load the stored recent questions.

    Args:
        path: JSON file to read. Defaults to ``RECENT_QUESTIONS_FILE``.

    Returns:
        A dict of the form ``{"questions": [...]}``. The empty structure
        ``{"questions": []}`` is returned when the file does not exist, does
        not contain valid JSON, or does not hold a dict with a list under
        ``"questions"``.
    """
    if path is None:
        path = RECENT_QUESTIONS_FILE
    try:
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # An empty or truncated file is handled like a missing one instead of
        # raising in every caller.
        return {"questions": []}
    if not isinstance(data, dict) or not isinstance(data.get("questions"), list):
        return {"questions": []}
    return data


def save_recent_question(question, metrics, max_entries=5, path=None):
    """Append a question and its metrics to the store, keeping the newest ones.

    Args:
        question: The question text to record.
        metrics: Metrics recorded with the question; must be JSON-serializable.
        max_entries: How many of the most recent entries to keep (default 5).
        path: JSON file to update. Defaults to ``RECENT_QUESTIONS_FILE``.

    Raises:
        ValueError: If ``max_entries`` is less than 1.
        TypeError: If the data cannot be serialized to JSON; the existing
            file is left untouched in that case.
    """
    if max_entries < 1:
        # A slice of [-0:] would keep everything rather than nothing.
        raise ValueError("max_entries must be at least 1")
    if path is None:
        path = RECENT_QUESTIONS_FILE

    data = load_recent_questions(path)

    # Append new question & metrics, then keep only the last max_entries
    data["questions"].append({"question": question, "metrics": metrics})
    data["questions"] = data["questions"][-max_entries:]

    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error raised afterwards would leave a corrupt store.
    payload = json.dumps(data, indent=4)

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as file:
        file.write(payload)