import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from data_processing import load_query_dataset
# Module-level defaults; retrieve_ground_truths() supplies per-query values.
ground_truth_answer = ''
ground_truth_metrics = {}

# Step 1: Helper function to compute TF-IDF cosine similarity between two texts
def compute_cosine_similarity(text1, text2):
    if not text1 or not text2:  # Check for empty or None values
        print("Error: One or both input texts are empty. Returning similarity as 0.")
        return 0.0
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        vectors = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
        return similarity
    except ValueError as e:
        # Raised when both texts vectorize to an empty vocabulary (e.g. stop words only)
        print(f"Error in vectorization: {e}. Returning similarity as 0.")
        return 0.0
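
# Illustrative usage (a sketch; the strings below are made up):
#   compute_cosine_similarity("what is RAG?", "RAG means retrieval-augmented generation")
# returns a float in [0.0, 1.0]: 1.0 for identical TF-IDF vectors, 0.0 when the texts
# share no vocabulary after English stop-word removal.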

# Step 2: Metric 1 - Context Relevance: similarity between the question and the retrieved context
def context_relevance(question, relevant_documents):
    # For LangChain Document objects use: " ".join(doc.page_content for doc in relevant_documents)
    combined_docs = " ".join(relevant_documents)
    return compute_cosine_similarity(question, combined_docs)

# Step 3: Metric 2 - Context Utilization: similarity between the response and the retrieved context
def context_utilization(response, relevant_documents):
    # For LangChain Document objects use: " ".join(doc.page_content for doc in relevant_documents)
    combined_docs = " ".join(relevant_documents)
    return compute_cosine_similarity(response, combined_docs)
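
# How the two context metrics differ (hypothetical inputs):
#   docs = ["Paris is the capital of France.", "France is in Europe."]
#   context_relevance("What is the capital of France?", docs)     # question vs. context
#   context_utilization("The capital of France is Paris.", docs)  # response vs. context
# Both expect an iterable of plain strings (see the notes above for Document objects).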

# Step 4: Metric 3 - Completeness: similarity between the response and the ground-truth answer
def completeness(response, ground_truth_answer):
    return compute_cosine_similarity(response, ground_truth_answer)

# Step 5: Metric 4 - Adherence: True when at least half of the response tokens
# also appear in the retrieved documents (a coarse groundedness check)
def adherence(response, relevant_documents):
    combined_docs = " ".join(relevant_documents)
    response_tokens = set(response.split())
    if not response_tokens:  # Guard against division by zero on an empty response
        return False
    relevant_tokens = set(combined_docs.split())
    supported_tokens = response_tokens.intersection(relevant_tokens)
    return len(supported_tokens) / len(response_tokens) >= 0.5
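
# Threshold behaviour on hypothetical inputs:
#   adherence("Paris is the capital", ["Paris is the capital of France"])   # True (4/4 supported)
#   adherence("Berlin is the capital", ["Paris is the capital of France"])  # True (3/4 supported)
#   adherence("Berlin Germany", ["Paris is the capital of France"])         # False (0/2 supported)
# Note the second case: unsupported claims can still pass when enough common words overlap.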

# Step 6: RMSE between the predicted and ground-truth metric vectors.
# Both dicts must list the metrics in the same key order, since values() are paired
# positionally. Ground-truth booleans are mapped to 0.75 (True) / 0.25 (False); the
# bool check must come first because isinstance(True, int) is True in Python.
def compute_rmse(predicted_values, ground_truth_values):
    predicted_ = [float(v) for v in predicted_values.values()]
    ground_truth_ = [
        (0.75 if v else 0.25) if isinstance(v, bool) else float(v)
        for v in ground_truth_values.values()
    ]
    return np.sqrt(mean_squared_error(ground_truth_, predicted_))
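
# Worked example (hypothetical values): predictions
#   {"context_relevance": 0.8, "context_utilization": 0.6, "completeness": 0.7, "adherence": True}
# against ground truth
#   {"context_relevance": 0.9, "context_utilization": 0.5, "completeness": 0.7, "adherence": 0.8}
# become [0.8, 0.6, 0.7, 1.0] and [0.9, 0.5, 0.7, 0.8], so
# RMSE = sqrt((0.01 + 0.01 + 0.00 + 0.04) / 4) ≈ 0.1225.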

def calculate_metrics(question, q_dataset, response, docs, time_taken):
    data = load_query_dataset(q_dataset)
    # Look up the ground-truth answer and metric scores for this question
    ground_truth_answer, ground_truth_metrics = retrieve_ground_truths(question, data)
    if ground_truth_answer is None:
        ground_truth_answer = ""  # Default to an empty string if no ground truth is found
    # Predicted metrics
    predicted_metrics = {
        "context_relevance": context_relevance(question, docs),
        "context_utilization": context_utilization(response, docs),
        "completeness": completeness(response, ground_truth_answer),
        "adherence": adherence(response, docs),
    }
    # RMSE is only meaningful when ground-truth scores were found
    rmse = compute_rmse(predicted_metrics, ground_truth_metrics) if ground_truth_metrics else None
    metrics = {
        "RMSE": rmse,
        "predicted_metrics": predicted_metrics,
        "response_time": time_taken,
        "ground_truth": ground_truth_answer,
        "RAG_model_response": response,
    }
    return metrics
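
# Illustrative call (a sketch; the dataset name and argument values are assumptions):
#   metrics = calculate_metrics(
#       question="What is the capital of France?",
#       q_dataset="rag_eval_dataset",        # whatever load_query_dataset() expects
#       response="The capital of France is Paris.",
#       docs=["Paris is the capital of France."],
#       time_taken=1.42,                     # seconds, measured by the caller
#   )
#   print(metrics["RMSE"], metrics["predicted_metrics"])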

def retrieve_ground_truths(question, dataset):
    for split_name, instances in dataset.items():
        print(f"Processing {split_name} split")
        for instance in instances:
            # Fuzzy match instead of exact string equality
            if is_similar(instance['question'], question):
                instance_id = instance['id']
                ground_truth = instance['response']
                ground_truth_metrics_ = {
                    "context_relevance": instance['relevance_score'],
                    "context_utilization": instance['utilization_score'],
                    "completeness": instance['completeness_score'],
                    "adherence": instance['adherence_score'],
                }
                print(f"Match found in {split_name} split!")
                print(f"ID: {instance_id}, Response: {ground_truth}")
                return ground_truth, ground_truth_metrics_  # Return the first match immediately
    return None, None  # No sufficiently similar question in any split

# Fuzzy question matching: TF-IDF cosine similarity above a fixed threshold
def is_similar(question1, question2, threshold=0.85):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([question1, question2])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity >= threshold
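
if __name__ == "__main__":
    # Minimal smoke test for the pure-text metrics (inputs are made up; no dataset needed).
    docs = ["Paris is the capital of France.", "France is a country in Europe."]
    question = "What is the capital of France?"
    response = "The capital of France is Paris."
    print("context_relevance: ", context_relevance(question, docs))
    print("context_utilization:", context_utilization(response, docs))
    print("completeness:       ", completeness(response, "Paris is the capital of France."))
    print("adherence:          ", adherence(response, docs))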