Update evaluation.py

evaluation.py CHANGED (+16 -7)
@@ -109,22 +109,31 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
     # Ensure ground_truth_answer is not empty before proceeding
     if ground_truth_answer is None:
         ground_truth_answer = ""  # Default to an empty string if no ground truth is found
-
+
+    # Convert ground truth to numeric form (e.g., using cosine similarity or some metric)
+    # Here, let's assume completeness is based on cosine similarity between the response and the ground truth
+    ground_truth_completeness = compute_cosine_similarity(response, ground_truth_answer)
+
     # Predicted metrics
     predicted_metrics = {
         "RAG_model_response": response,
         "ground_truth": ground_truth_answer,
         "context_relevance": context_relevance(question, docs),
         "context_utilization": context_utilization(response, docs),
-        "completeness": completeness(response, ground_truth_answer),
+        "completeness": compute_cosine_similarity(response, ground_truth_answer), #completeness(response, ground_truth_answer),
         "adherence": adherence(response, docs),
         "response_time": time_taken
     }
-    #
-
-
-
-
+    # Now, make sure the values passed to RMSE calculation are numeric
+    predicted_completeness = predicted_metrics['completeness']
+
+    # Ensure both predicted_completeness and ground_truth_completeness are numeric before calculating RMSE
+    if isinstance(predicted_completeness, (int, float)) and isinstance(ground_truth_completeness, (int, float)):
+        rmse_value = compute_rmse([predicted_completeness], [ground_truth_completeness])
+        predicted_metrics["rmse"] = rmse_value  # Adding RMSE to metrics
+    else:
+        predicted_metrics["rmse"] = "Invalid RMSE calculation"
+
     return predicted_metrics
 
 ''' def retrieve_ground_truths(question, dataset):
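
The new hunk calls two helpers, compute_cosine_similarity and compute_rmse, that are defined elsewhere in evaluation.py and not shown in this diff. Below is a minimal sketch of what such helpers could look like, assuming a TF-IDF based cosine similarity via scikit-learn and the standard root-mean-square-error formula; the names match the calls above, but the bodies are illustrative assumptions, not the file's actual implementation.

# Hypothetical helper implementations -- not part of this commit; the real
# versions in evaluation.py may differ. Assumes NumPy and scikit-learn.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def compute_cosine_similarity(text_a, text_b):
    # TF-IDF vectorize both texts and return their cosine similarity in [0, 1].
    if not text_a or not text_b:
        return 0.0
    tfidf = TfidfVectorizer().fit_transform([text_a, text_b])
    return float(cosine_similarity(tfidf[0], tfidf[1])[0][0])


def compute_rmse(predicted_values, ground_truth_values):
    # Root mean squared error between two equal-length numeric sequences.
    predicted = np.asarray(predicted_values, dtype=float)
    actual = np.asarray(ground_truth_values, dtype=float)
    return float(np.sqrt(np.mean((predicted - actual) ** 2)))

With single-element lists, as in the hunk above, this RMSE reduces to the absolute difference between the predicted and ground-truth completeness scores.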