Upload 2 files
- app.py +1 -1
- evaluation.py +1 -36
app.py CHANGED
@@ -49,7 +49,7 @@ if st.session_state.recent_questions:
     # Display Recent Questions
     st.sidebar.title("Overall RMSE")
     rmse_values = [q["metrics"]["RMSE"] for q in recent_qns if "metrics" in q and "RMSE" in q["metrics"]]
-    if any(rmse_values):
+    if any(rmse_values) and len(rmse_values) > 0:
         average_rmse = sum(rmse_values) / len(rmse_values) if rmse_values else 0
         st.sidebar.write(f"📊 **Average RMSE:** {average_rmse:.4f} for {len(rmse_values)} questions")
 
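The changed guard is easier to reason about outside Streamlit. Below is a minimal, runnable sketch of the same averaging logic, with a hypothetical recent_qns list standing in for st.session_state.recent_questions and print standing in for st.sidebar.write:

# Sketch of the sidebar RMSE averaging from app.py (sample data is hypothetical).
recent_qns = [
    {"question": "Q1", "metrics": {"RMSE": 0.12}},
    {"question": "Q2", "metrics": {"RMSE": 0.30}},
    {"question": "Q3"},  # no metrics recorded yet; filtered out below
]

rmse_values = [q["metrics"]["RMSE"] for q in recent_qns if "metrics" in q and "RMSE" in q["metrics"]]

if any(rmse_values) and len(rmse_values) > 0:
    average_rmse = sum(rmse_values) / len(rmse_values) if rmse_values else 0
    print(f"Average RMSE: {average_rmse:.4f} for {len(rmse_values)} questions")  # 0.2100 for 2 questions

One caveat worth noting: any(rmse_values) is False when every collected RMSE is exactly 0.0, so an all-zero average would not be shown. If that case matters, checking list truthiness with "if rmse_values:" covers it, and the extra len(rmse_values) > 0 test is then redundant.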
evaluation.py CHANGED
@@ -11,41 +11,6 @@ global ground_truth_answer, ground_truth_metrics
 ground_truth_answer = ''
 ground_truth_metrics = {}
 
-# def calculate_metrics(question, response, docs, time_taken):
-#     data = load_ragbench()
-#     retrieve_ground_truths(question, data)
-#     # Predicted metrics
-#     predicted_metrics = {
-#         "ground_truth": ground_truth_answer,
-#         "context_relevance": context_relevance(question, docs),
-#         "context_utilization": context_utilization(response, docs),
-#         "completeness": completeness(response, ground_truth_answer),
-#         "adherence": adherence(response, docs),
-#         "response_time" : time_taken
-#     }
-#     return predicted_metrics
-
-# def retrieve_ground_truths(question,ragbench_set):
-#     for dataset_name in ragbench_set.keys():
-#         for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
-#             print(f"Processing {split_name} split")
-#             for instance in instances: # Fixed: Corrected indentation
-#                 # Check if the question (data) matches the query
-#                 if instance['question'] == question:
-#                     # If a match is found, retrieve id and response
-#                     instance_id = instance['id']
-#                     instance_response = instance['response']
-#                     ground_truth_metrics = {
-#                         "context_relevance": instance['relevance_score'],
-#                         "context_utilization": instance['utilization_score'],
-#                         "completeness": instance['completeness_score'],
-#                         "adherence": instance['adherence_score']
-#                     }
-#                     ground_truth_answer = instance_response
-#                     print(f"Match found in {split_name} split!")
-#                     print(f"ID: {instance_id}, Response: {instance_response}")
-#                     break # Exit after finding the first match (optional)
-
 # Step 1: Helper function to compute cosine similarity
 def compute_cosine_similarity(text1, text2):
     if not text1 or not text2: # Check for empty or None values
@@ -109,7 +74,7 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
         "adherence": adherence(response, docs),
     }
 
-    rmse = compute_rmse(predicted_metrics, ground_truth_metrics)
+    rmse = compute_rmse(predicted_metrics, ground_truth_metrics)
 
     metrics = {
         "RMSE": rmse,
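The re-indented line calls compute_rmse(predicted_metrics, ground_truth_metrics), whose definition sits outside this hunk. As a point of reference only, here is a minimal sketch of such a helper, assuming both arguments are dicts of per-metric floats and that the root-mean-square error is taken over the keys present in both (the actual behavior in evaluation.py may differ):

import math

def compute_rmse(predicted_metrics, ground_truth_metrics):
    # Compare only metric keys that appear in both dicts with numeric values.
    shared = [
        k for k in predicted_metrics
        if k in ground_truth_metrics
        and isinstance(predicted_metrics[k], (int, float))
        and isinstance(ground_truth_metrics[k], (int, float))
    ]
    if not shared:
        return None  # nothing to compare against
    squared_errors = [(predicted_metrics[k] - ground_truth_metrics[k]) ** 2 for k in shared]
    return math.sqrt(sum(squared_errors) / len(squared_errors))

# Hypothetical scores for illustration:
predicted = {"context_relevance": 0.82, "completeness": 0.70, "adherence": 1.0}
truth = {"context_relevance": 0.90, "completeness": 0.75, "adherence": 1.0}
print(compute_rmse(predicted, truth))  # ≈ 0.0545

In this sketch, keys without a numeric counterpart in the ground-truth dict (for example a stored answer string or a response time) are simply skipped, which keeps the comparison limited to the shared evaluation scores.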