Update evaluation.py
evaluation.py  +9 -0
@@ -102,6 +102,13 @@ def retrieve_ground_truths(question, dataset):
     for split_name, instances in dataset.items():
         for instance in instances:
             if instance['question'] == question:
+                instance_response = instance['response']
+                ground_truth_metrics = {
+                    "context_relevance": instance['relevance_score'],
+                    "context_utilization": instance['utilization_score'],
+                    "completeness": instance['completeness_score'],
+                    "adherence": instance['adherence_score']
+                }
                 return instance['response']  # Return the ground truth response immediately
     return None  # Return None if no match is found
 
@@ -143,6 +150,8 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
         "response_time": time_taken,
         "ground truth completeness": ground_truth_completeness
     }
+
+
     store_rmse(question, predicted_metrics, ground_truth_metrics)
     # Now, make sure the values passed to RMSE calculation are numeric
     predicted_completeness = predicted_metrics['completeness']
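
For context, the first hunk implies a dataset shaped as a mapping from split names to lists of instance dicts, each carrying the question, the reference response, and the four score fields the new code reads. A minimal sketch of that assumed shape, using the function exactly as shown in the diff (the example dataset and its values are hypothetical, invented only to illustrate the fields):

    def retrieve_ground_truths(question, dataset):
        # As in the diff: scan every split for a matching question and
        # return its reference response; None if nothing matches.
        for split_name, instances in dataset.items():
            for instance in instances:
                if instance['question'] == question:
                    return instance['response']
        return None

    # Hypothetical dataset illustrating the fields the diff reads.
    dataset = {
        "test": [
            {
                "question": "What is RAG?",
                "response": "Retrieval-augmented generation combines ...",
                "relevance_score": 0.9,
                "utilization_score": 0.8,
                "completeness_score": 0.85,
                "adherence_score": 1.0,
            }
        ]
    }

    print(retrieve_ground_truths("What is RAG?", dataset))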
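The second hunk hands predicted and ground-truth metric dicts to store_rmse, whose body is not part of this diff. As a rough sketch of what an RMSE over the shared numeric metrics could look like, assuming both dicts use the keys built in the first hunk (rmse_over_metrics is a hypothetical stand-in, not the project's store_rmse):

    import math

    def rmse_over_metrics(predicted_metrics, ground_truth_metrics):
        # Compare only keys present in both dicts, coercing to float so
        # string-typed scores don't break the arithmetic (the diff's own
        # comment notes the values passed to RMSE must be numeric).
        keys = predicted_metrics.keys() & ground_truth_metrics.keys()
        if not keys:
            return None
        squared_errors = [
            (float(predicted_metrics[k]) - float(ground_truth_metrics[k])) ** 2
            for k in keys
        ]
        return math.sqrt(sum(squared_errors) / len(squared_errors))

    predicted = {"completeness": 0.7, "adherence": 1.0}
    ground_truth = {"completeness": 0.85, "adherence": 1.0}
    print(rmse_over_metrics(predicted, ground_truth))  # ~0.106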