import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from data_processing import load_query_dataset
# Module-level defaults; the active functions below return ground truths rather than mutating these
ground_truth_answer = ''
ground_truth_metrics = {}
# def calculate_metrics(question, response, docs, time_taken):
# data = load_ragbench()
# retrieve_ground_truths(question, data)
# # Predicted metrics
# predicted_metrics = {
# "ground_truth": ground_truth_answer,
# "context_relevance": context_relevance(question, docs),
# "context_utilization": context_utilization(response, docs),
# "completeness": completeness(response, ground_truth_answer),
# "adherence": adherence(response, docs),
# "response_time" : time_taken
# }
# return predicted_metrics
# def retrieve_ground_truths(question,ragbench_set):
# for dataset_name in ragbench_set.keys():
# for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
# print(f"Processing {split_name} split")
# for instance in instances: # Fixed: Corrected indentation
# # Check if the question (data) matches the query
# if instance['question'] == question:
# # If a match is found, retrieve id and response
# instance_id = instance['id']
# instance_response = instance['response']
# ground_truth_metrics = {
# "context_relevance": instance['relevance_score'],
# "context_utilization": instance['utilization_score'],
# "completeness": instance['completeness_score'],
# "adherence": instance['adherence_score']
# }
# ground_truth_answer = instance_response
# print(f"Match found in {split_name} split!")
# print(f"ID: {instance_id}, Response: {instance_response}")
# break # Exit after finding the first match (optional)
# Step 1: Helper function to compute cosine similarity
def compute_cosine_similarity(text1, text2):
    if not text1 or not text2:  # Check for empty or None values
        print("Error: One or both input texts are empty. Returning similarity as 0.")
        return 0.0
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        vectors = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
        return similarity
    except ValueError as e:
        print(f"Error in vectorization: {e}. Returning similarity as 0.")
        return 0.0
# Step 2: Metric 1 - Context Relevance
def context_relevance(question, relevant_documents):
    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
    combined_docs = " ".join(relevant_documents)
    return compute_cosine_similarity(question, combined_docs)
# Step 3: Metric 2 - Context Utilization
def context_utilization(response, relevant_documents):
    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
    combined_docs = " ".join(relevant_documents)
    return compute_cosine_similarity(response, combined_docs)
# Step 4: Metric 3 - Completeness
def completeness(response, ground_truth_answer):
    return compute_cosine_similarity(response, ground_truth_answer)
# Step 5: Metric 4 - Adherence
def adherence(response, relevant_documents):
    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
    combined_docs = " ".join(relevant_documents)
    response_tokens = set(response.split())
    if not response_tokens:  # Guard against division by zero for an empty response
        return False
    relevant_tokens = set(combined_docs.split())
    supported_tokens = response_tokens.intersection(relevant_tokens)
    # The response is considered adherent if at least half of its tokens appear in the retrieved context
    return len(supported_tokens) / len(response_tokens) >= 0.5
# Step 6: Compute RMSE for metrics
def compute_rmse(predicted_values, ground_truth_values):
    return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
def calculate_metrics(question, q_dataset, response, docs, time_taken):
    data = load_query_dataset(q_dataset)
    ground_truth_answer, ground_truth_metrics = retrieve_ground_truths(question, data)  # Look up the ground truth answer and scores
    # Ensure ground_truth_answer is not empty before proceeding
    if ground_truth_answer is None:
        ground_truth_answer = ""  # Default to an empty string if no ground truth is found
    # Predicted metrics
    predicted_metrics = {
        "context_relevance": context_relevance(question, docs),
        "context_utilization": context_utilization(response, docs),
        "completeness": completeness(response, ground_truth_answer),
        "adherence": adherence(response, docs),
    }
    # Compare predicted and ground-truth scores on the keys they share; skip RMSE when no ground truth was found
    if ground_truth_metrics:
        shared_keys = [k for k in predicted_metrics if k in ground_truth_metrics]
        rmse = compute_rmse(
            [float(predicted_metrics[k]) for k in shared_keys],
            [float(ground_truth_metrics[k]) for k in shared_keys],
        )
    else:
        rmse = None
    metrics = {
        "RMSE": rmse,
        "metrics": predicted_metrics,
        "response_time": time_taken,
        "ground_truth": ground_truth_answer,
        "RAG_model_response": response,
    }
    return metrics
def retrieve_ground_truths(question, dataset):
    for split_name, instances in dataset.items():
        print(f"Processing {split_name} split")
        for instance in instances:
            # if instance['question'] == question:
            if is_similar(instance['question'], question):
                instance_id = instance['id']
                ground_truth = instance['response']
                ground_truth_metrics_ = {
                    "context_relevance": instance['relevance_score'],
                    "context_utilization": instance['utilization_score'],
                    "completeness": instance['completeness_score'],
                    "adherence": instance['adherence_score']
                }
                print(f"Match found in {split_name} split!")
                print(f"ID: {instance_id}, Response: {ground_truth}")
                return ground_truth, ground_truth_metrics_  # Return the ground truth response and scores immediately
    return None, None
def is_similar(question1, question2, threshold=0.85):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([question1, question2])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity >= threshold
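

# Example usage: a minimal sketch of how calculate_metrics might be invoked end to end.
# The dataset name, question, response, retrieved documents, and timing below are
# hypothetical placeholders, not values taken from the actual pipeline.
if __name__ == "__main__":
    sample_question = "What are the side effects of aspirin?"  # hypothetical query
    sample_response = "Aspirin can cause stomach irritation and an increased risk of bleeding."  # hypothetical RAG answer
    sample_docs = [
        "Aspirin may cause stomach irritation, nausea, and an increased risk of bleeding.",
        "Common NSAID side effects include gastrointestinal discomfort.",
    ]  # hypothetical retrieved passages (plain strings, as expected by the metric functions)
    # "covidqa" is assumed here to be a dataset name accepted by load_query_dataset
    result = calculate_metrics(sample_question, "covidqa", sample_response, sample_docs, time_taken=1.23)
    print(result)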