Upload 3 files
- data_processing.py +4 -4
- evaluation.py +71 -34
data_processing.py
CHANGED
@@ -76,11 +76,11 @@ def create_faiss_index(dataset):
 
 def load_ragbench():
     global ragbench
-    if ragbench
+    if ragbench:
         return ragbench
-
-'
-
+    datasets = ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa',
+                'tatqa', 'techqa']
+    for dataset in datasets:
         ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
     return ragbench
 
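Usage note: a minimal sketch of how the revised loader can be called, assuming the module is importable as data_processing, that ragbench is a module-level dict defined earlier in the file, and that load_dataset comes from the Hugging Face datasets library (the 'covidqa' subset below is just an example):

    from data_processing import load_ragbench  # hypothetical import path for this Space

    data = load_ragbench()             # downloads and caches all twelve ragbench subsets
    covid = data['covidqa']            # each entry is a DatasetDict keyed by split
    for split_name, instances in covid.items():
        print(split_name, len(instances))  # number of QA instances per split

Because ragbench acts as a module-level cache, repeated calls return the already-loaded datasets instead of downloading them again.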
evaluation.py
CHANGED
@@ -11,40 +11,40 @@ global ground_truth_answer, ground_truth_metrics
 ground_truth_answer = ''
 ground_truth_metrics = {}
 
-def calculate_metrics(question, response, docs, time_taken):
-    ...
-def retrieve_ground_truths(question,ragbench_set):
-    ...
+# def calculate_metrics(question, response, docs, time_taken):
+#     data = load_ragbench()
+#     retrieve_ground_truths(question, data)
+#     # Predicted metrics
+#     predicted_metrics = {
+#         "ground_truth": ground_truth_answer,
+#         "context_relevance": context_relevance(question, docs),
+#         "context_utilization": context_utilization(response, docs),
+#         "completeness": completeness(response, ground_truth_answer),
+#         "adherence": adherence(response, docs),
+#         "response_time" : time_taken
+#     }
+#     return predicted_metrics
+
+# def retrieve_ground_truths(question,ragbench_set):
+#     for dataset_name in ragbench_set.keys():
+#         for split_name,instances in ragbench_set[dataset_name].items():  # Fixed: Removed extra '.' and corrected indentation
+#             print(f"Processing {split_name} split")
+#             for instance in instances:  # Fixed: Corrected indentation
+#                 # Check if the question (data) matches the query
+#                 if instance['question'] == question:
+#                     # If a match is found, retrieve id and response
+#                     instance_id = instance['id']
+#                     instance_response = instance['response']
+#                     ground_truth_metrics = {
+#                         "context_relevance": instance['relevance_score'],
+#                         "context_utilization": instance['utilization_score'],
+#                         "completeness": instance['completeness_score'],
+#                         "adherence": instance['adherence_score']
+#                     }
+#                     ground_truth_answer = instance_response
+#                     print(f"Match found in {split_name} split!")
+#                     print(f"ID: {instance_id}, Response: {instance_response}")
+#                     break  # Exit after finding the first match (optional)
 
 # Step 1: Helper function to compute cosine similarity
 def compute_cosine_similarity(text1, text2):
@@ -91,4 +91,41 @@ def adherence(response, relevant_documents):
 def compute_rmse(predicted_values, ground_truth_values):
     return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
 
+def calculate_metrics(question, response, docs, time_taken):
+    data = load_ragbench()
+    ground_truth_answer = retrieve_ground_truths(question, data)  # Store the ground truth answer
+
+    # Ensure ground_truth_answer is not empty before proceeding
+    if ground_truth_answer is None:
+        ground_truth_answer = ""  # Default to an empty string if no ground truth is found
+
+    # Predicted metrics
+    predicted_metrics = {
+        "ground_truth": ground_truth_answer,
+        "context_relevance": context_relevance(question, docs),
+        "context_utilization": context_utilization(response, docs),
+        "completeness": completeness(response, ground_truth_answer),
+        "adherence": adherence(response, docs),
+        "response_time": time_taken
+    }
+    return predicted_metrics
+
+def retrieve_ground_truths(question, ragbench_set):
+    for dataset_name in ragbench_set.keys():
+        for split_name, instances in ragbench_set[dataset_name].items():
+            print(f"Processing {split_name} split")
+            for instance in instances:
+                if instance['question'] == question:
+                    instance_id = instance['id']
+                    instance_response = instance['response']
+                    # ground_truth_metrics = {
+                    #     "context_relevance": instance['relevance_score'],
+                    #     "context_utilization": instance['utilization_score'],
+                    #     "completeness": instance['completeness_score'],
+                    #     "adherence": instance['adherence_score']
+                    # }
+                    print(f"Match found in {split_name} split!")
+                    print(f"ID: {instance_id}, Response: {instance_response}")
+                    return instance_response  # Return ground truth response immediately
 
+    return None  # Return None if no match is found