23RAG7 / evaluation.py
vamseelatha2002's picture
Update evaluation.py
2a67170 verified
raw
history blame
7.75 kB
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from data_processing import load_query_dataset
# Module-level placeholders for the ground-truth answer and metric scores.
# No live function in this section reads them (calculate_metrics uses its own
# local variable); they are kept so that any external code importing these
# names keeps working. A `global` statement is a no-op at module scope, so it
# has been dropped.
ground_truth_answer = ''
ground_truth_metrics = {}
# def calculate_metrics(question, response, docs, time_taken):
# data = load_ragbench()
# retrieve_ground_truths(question, data)
# # Predicted metrics
# predicted_metrics = {
# "ground_truth": ground_truth_answer,
# "context_relevance": context_relevance(question, docs),
# "context_utilization": context_utilization(response, docs),
# "completeness": completeness(response, ground_truth_answer),
# "adherence": adherence(response, docs),
# "response_time" : time_taken
# }
# return predicted_metrics
# def retrieve_ground_truths(question,ragbench_set):
# for dataset_name in ragbench_set.keys():
# for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
# print(f"Processing {split_name} split")
# for instance in instances: # Fixed: Corrected indentation
# # Check if the question (data) matches the query
# if instance['question'] == question:
# # If a match is found, retrieve id and response
# instance_id = instance['id']
# instance_response = instance['response']
# ground_truth_metrics = {
# "context_relevance": instance['relevance_score'],
# "context_utilization": instance['utilization_score'],
# "completeness": instance['completeness_score'],
# "adherence": instance['adherence_score']
# }
# ground_truth_answer = instance_response
# print(f"Match found in {split_name} split!")
# print(f"ID: {instance_id}, Response: {instance_response}")
# break # Exit after finding the first match (optional)
# Step 1: Helper function to compute cosine similarity
def compute_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    Both texts are vectorized together (English stop words removed) and the
    cosine similarity between the two resulting rows is returned.

    If either text is empty or None, or if vectorization raises
    ``ValueError``, a message is printed and 0.0 is returned instead.
    """
    if not (text1 and text2):
        print("Error: One or both input texts are empty. Returning similarity as 0.")
        return 0.0

    tfidf = TfidfVectorizer(stop_words="english")
    try:
        matrix = tfidf.fit_transform([text1, text2])
        # cosine_similarity yields a 1x1 matrix for two single rows.
        return cosine_similarity(matrix[0], matrix[1])[0][0]
    except ValueError as err:
        print(f"Error in vectorization: {err}. Returning similarity as 0.")
        return 0.0
# Step 2: Metric 1 - Context Relevance
def context_relevance(question, relevant_documents):
    """Score how similar the question is to the retrieved documents.

    ``relevant_documents`` is an iterable of strings; they are joined with
    single spaces and compared to ``question`` through
    ``compute_cosine_similarity``.
    """
    merged_context = " ".join(relevant_documents)
    return compute_cosine_similarity(question, merged_context)
# Step 3: Metric 2 - Context Utilization
def context_utilization(response, relevant_documents):
    """Score how similar the response is to the retrieved documents.

    ``relevant_documents`` is an iterable of strings; they are joined with
    single spaces and compared to ``response`` through
    ``compute_cosine_similarity``.
    """
    merged_context = " ".join(relevant_documents)
    return compute_cosine_similarity(response, merged_context)
# Step 4: Metric 3 - Completeness
def completeness(response, ground_truth_answer):
    """Score the response against the ground-truth answer.

    The score is whatever ``compute_cosine_similarity`` returns for the pair.
    """
    score = compute_cosine_similarity(response, ground_truth_answer)
    return score
# Step 5: Metric 4 - Adherence
def adherence(response, relevant_documents):
    """Return the fraction of distinct response tokens found in the documents.

    Tokens are whitespace-separated and compared case-sensitively, without
    any punctuation stripping.

    Args:
        response: The generated answer text.
        relevant_documents: Iterable of document strings.

    Returns:
        A float in [0, 1]. Returns 0.0 when the response is None, empty or
        contains only whitespace, instead of dividing by zero.
    """
    if not response:
        return 0.0

    response_tokens = set(response.split())
    if not response_tokens:
        # Whitespace-only response: nothing to support, avoid 0/0.
        return 0.0

    document_tokens = set(" ".join(relevant_documents).split())
    supported_tokens = response_tokens & document_tokens
    return len(supported_tokens) / len(response_tokens)
# Step 6: Compute RMSE for metrics
def compute_rmse(predicted_values, ground_truth_values):
    """Return the root-mean-squared error between two numeric sequences.

    Args:
        predicted_values: Iterable of predicted numbers.
        ground_truth_values: Iterable of reference numbers, same length.

    Returns:
        The RMSE as a NumPy float, or None (after printing a message) when
        the input is unusable: a non-numeric element, an empty sequence, or
        sequences of different lengths.
    """
    predicted = list(predicted_values)
    expected = list(ground_truth_values)

    # Accept NumPy scalars too; np.float32 / np.int64 are not subclasses of
    # the built-in float / int.
    numeric_types = (int, float, np.integer, np.floating)
    if not all(isinstance(value, numeric_types) for value in predicted + expected):
        print("Invalid input for RMSE calculation. Ensure all values are numeric.")
        return None

    if not predicted or len(predicted) != len(expected):
        print("Invalid input for RMSE calculation. Inputs must be non-empty and of equal length.")
        return None

    errors = np.asarray(expected, dtype=float) - np.asarray(predicted, dtype=float)
    return np.sqrt(np.mean(errors ** 2))
def retrieve_ground_truths(question, dataset):
    """Look up the ground-truth response for ``question``.

    ``dataset`` maps split names to iterables of records; each record is
    indexed by ``'question'`` and ``'response'``. Splits are scanned in
    iteration order and the response of the first record whose question
    equals ``question`` exactly is returned, or None if nothing matches.
    """
    matching_responses = (
        record['response']
        for _split, records in dataset.items()
        for record in records
        if record['question'] == question
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(matching_responses, None)
def calculate_metrics(question, q_dataset, response, docs, time_taken):
    """Compute the evaluation metrics for one RAG response.

    Args:
        question: The user query; also used to look up the ground truth.
        q_dataset: Dataset identifier handed to ``load_query_dataset``.
        response: Text produced by the RAG model.
        docs: Retrieved documents as an iterable of strings.
        time_taken: Response time, stored unchanged in the result.

    Returns:
        A dict with the keys ``RAG_model_response``, ``ground_truth``,
        ``context_relevance``, ``context_utilization``, ``completeness``,
        ``adherence``, ``response_time`` and ``rmse``.
    """
    data = load_query_dataset(q_dataset)

    # Fall back to an empty string when the question is not in the dataset.
    ground_truth_answer = retrieve_ground_truths(question, data)
    if ground_truth_answer is None:
        ground_truth_answer = ""

    # Computed once and reused for both the metric and the RMSE input.
    completeness_score = completeness(response, ground_truth_answer)

    predicted_metrics = {
        "RAG_model_response": response,
        "ground_truth": ground_truth_answer,
        "context_relevance": context_relevance(question, docs),
        "context_utilization": context_utilization(response, docs),
        "completeness": completeness_score,
        "adherence": adherence(response, docs),
        "response_time": time_taken,
    }

    # NOTE(review): the "ground truth" completeness is the same similarity as
    # the predicted one (same inputs), so this RMSE is 0 whenever it is
    # valid. A meaningful RMSE would need an annotated completeness score
    # from the dataset -- TODO confirm which field provides it.
    if isinstance(completeness_score, (int, float)):
        predicted_metrics["rmse"] = compute_rmse(
            [completeness_score], [completeness_score]
        )
    else:
        predicted_metrics["rmse"] = "Invalid RMSE calculation"

    return predicted_metrics
# Disabled variant of retrieve_ground_truths (adds debug prints and reads
# instance['id']), kept as a bare string literal so it is never executed.
''' def retrieve_ground_truths(question, dataset):
for split_name, instances in dataset.items():
print(f"Processing {split_name} split")
for instance in instances:
if instance['question'] == question:
instance_id = instance['id']
instance_response = instance['response']
# ground_truth_metrics = {
# "context_relevance": instance['relevance_score'],
# "context_utilization": instance['utilization_score'],
# "completeness": instance['completeness_score'],
# "adherence": instance['adherence_score']
# }
print(f"Match found in {split_name} split!")
print(f"ID: {instance_id}, Response: {instance_response}")
return instance_response # Return ground truth response immediately
return None # Return None if no match is found
'''