Spaces:
Sleeping
Sleeping
Update evaluation.py
Browse files
- evaluation.py +4 -2
evaluation.py
CHANGED
@@ -113,7 +113,8 @@ def retrieve_ground_truths(question, dataset):
|
|
113 |
"context_relevance": instance['relevance_score'],
|
114 |
"context_utilization": instance['utilization_score'],
|
115 |
"completeness": instance['completeness_score'],
|
116 |
-
"adherence": adherence_numerical
|
|
|
117 |
}
|
118 |
return instance_response, ground_truth_metrics # Return the ground truth response immediately
|
119 |
return None,None # Return None if no match is found
|
@@ -171,7 +172,8 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
|
|
171 |
"context_relevance": context_relevance(question, docs),
|
172 |
"context_utilization": context_utilization(response, docs),
|
173 |
"completeness": compute_cosine_similarity(response, ground_truth_answer), #completeness(response, ground_truth_answer),
|
174 |
-
"adherence": adherence(response, docs)
|
|
|
175 |
}
|
176 |
|
177 |
store_rmse(question, predicted_metrics, ground_truth_metrics)
|
|
|
113 |
"context_relevance": instance['relevance_score'],
|
114 |
"context_utilization": instance['utilization_score'],
|
115 |
"completeness": instance['completeness_score'],
|
116 |
+
"adherence": adherence_numerical,
|
117 |
+
"response_time": time_taken
|
118 |
}
|
119 |
return instance_response, ground_truth_metrics # Return the ground truth response immediately
|
120 |
return None,None # Return None if no match is found
|
|
|
172 |
"context_relevance": context_relevance(question, docs),
|
173 |
"context_utilization": context_utilization(response, docs),
|
174 |
"completeness": compute_cosine_similarity(response, ground_truth_answer), #completeness(response, ground_truth_answer),
|
175 |
+
"adherence": adherence(response, docs),
|
176 |
+
"response_time": time_taken
|
177 |
}
|
178 |
|
179 |
store_rmse(question, predicted_metrics, ground_truth_metrics)
|