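"""Evaluation metrics for a RAG pipeline.

Computes TF-IDF cosine-similarity based metrics (context relevance, context
utilization, completeness, adherence) for a generated response, looks up the
matching ground-truth record in the query dataset, and reports the RMSE
between the predicted and ground-truth metric values.
"""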
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from data_processing import load_query_dataset

# Module-level defaults. The active functions below return their results
# directly; these globals are only referenced by the commented-out legacy code.
ground_truth_answer = ''
ground_truth_metrics = {}

# def calculate_metrics(question, response, docs, time_taken):
#     data =  load_ragbench()
#     retrieve_ground_truths(question, data)
#     # Predicted metrics
#     predicted_metrics = {
#         "ground_truth": ground_truth_answer,
#         "context_relevance": context_relevance(question, docs),
#         "context_utilization": context_utilization(response, docs),
#         "completeness": completeness(response, ground_truth_answer),
#         "adherence": adherence(response, docs),
#         "response_time" : time_taken
#     }
#     return predicted_metrics

# def retrieve_ground_truths(question,ragbench_set):
#     for dataset_name in ragbench_set.keys():
#         for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
#             print(f"Processing {split_name} split")
#             for instance in instances: # Fixed: Corrected indentation
#                 # Check if the question (data) matches the query
#                 if instance['question'] == question:
#                     # If a match is found, retrieve id and response
#                     instance_id = instance['id']
#                     instance_response = instance['response']
#                     ground_truth_metrics = {
#                         "context_relevance": instance['relevance_score'],
#                         "context_utilization": instance['utilization_score'],
#                         "completeness": instance['completeness_score'],
#                         "adherence": instance['adherence_score']
#                     }
#                     ground_truth_answer = instance_response
#                     print(f"Match found in {split_name} split!")
#                     print(f"ID: {instance_id}, Response: {instance_response}")
#                     break  # Exit after finding the first match (optional)

# Step 1: Helper function to compute cosine similarity
def compute_cosine_similarity(text1, text2):
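    """Return the TF-IDF cosine similarity between two strings, or 0.0 on empty/invalid input."""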
    if not text1 or not text2:  # Check for empty or None values
        print("Error: One or both input texts are empty. Returning similarity as 0.")
        return 0.0

    vectorizer = TfidfVectorizer(stop_words="english")

    try:
        vectors = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
        return similarity
    except ValueError as e:
        print(f"Error in vectorization: {e}. Returning similarity as 0.")
        return 0.0

# Step 2: Metric 1 - Context Relevance
def context_relevance(question, relevant_documents):
    # Use doc.page_content instead if the documents are LangChain Document objects
    combined_docs = " ".join(relevant_documents)
    return compute_cosine_similarity(question, combined_docs)

# Step 3: Metric 2 - Context Utilization
def context_utilization(response, relevant_documents):
    # Use doc.page_content instead if the documents are LangChain Document objects
    combined_docs = " ".join(relevant_documents)
    return compute_cosine_similarity(response, combined_docs)

# Step 4: Metric 3 - Completeness
def completeness(response, ground_truth_answer):
    return compute_cosine_similarity(response, ground_truth_answer)

# Step 5: Metric 4 - Adherence
def adherence(response, relevant_documents):
    # Use doc.page_content instead if the documents are LangChain Document objects
    combined_docs = " ".join(relevant_documents)
    response_tokens = set(response.split())
    if not response_tokens:  # Guard against an empty response (avoids ZeroDivisionError)
        return False
    relevant_tokens = set(combined_docs.split())
    supported_tokens = response_tokens.intersection(relevant_tokens)
    # Treat the response as adherent when at least half of its tokens appear in the context
    return len(supported_tokens) / len(response_tokens) >= 0.5

# Step 6: Compute RMSE for metrics
def compute_rmse(predicted_values, ground_truth_values):
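    """Root-mean-squared error between two aligned sequences of metric values."""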
    return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))

def calculate_metrics(question, q_dataset, response, docs, time_taken):
    """Compute predicted metrics for a response and the RMSE against ground truth."""
    data = load_query_dataset(q_dataset)
    ground_truth_answer, ground_truth_metrics = retrieve_ground_truths(question, data)

    # Fall back to safe defaults when no matching ground truth is found
    if ground_truth_answer is None:
        ground_truth_answer = ""
    if ground_truth_metrics is None:
        ground_truth_metrics = {}

    # Predicted metrics
    predicted_metrics = {
        "context_relevance": context_relevance(question, docs),
        "context_utilization": context_utilization(response, docs),
        "completeness": completeness(response, ground_truth_answer),
        "adherence": adherence(response, docs),
    }

    # RMSE is computed over the metric values (not the dicts themselves),
    # restricted to keys that have a usable ground-truth counterpart
    shared_keys = [
        k for k in predicted_metrics
        if k in ground_truth_metrics and ground_truth_metrics[k] is not None
    ]
    if shared_keys:
        predicted_values = [float(predicted_metrics[k]) for k in shared_keys]
        ground_truth_values = [float(ground_truth_metrics[k]) for k in shared_keys]
        rmse = compute_rmse(predicted_values, ground_truth_values)
    else:
        rmse = None  # No ground truth available for this question

    metrics = {
        "RMSE": rmse,
        "metrics": predicted_metrics,
        "response_time": time_taken,
        "ground_truth": ground_truth_answer,
        "RAG_model_response": response,
    }

    return metrics

def retrieve_ground_truths(question, dataset):
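    """Scan every split of the dataset for a question similar to the query.

    Returns (ground_truth_response, ground_truth_metrics) for the first match,
    or (None, None) when no sufficiently similar question is found.
    """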
    for split_name, instances in dataset.items():
        print(f"Processing {split_name} split")
        for instance in instances:
            #if instance['question'] == question:
            if is_similar(instance['question'], question):
                instance_id = instance['id']
                ground_truth = instance['response']
                ground_truth_metrics_ = {
                    "context_relevance": instance['relevance_score'],
                    "context_utilization": instance['utilization_score'],
                    "completeness": instance['completeness_score'],
                    "adherence": instance['adherence_score']
                }
                print(f"Match found in {split_name} split!")
                print(f"ID: {instance_id}, Response: {ground_truth}")
                return ground_truth , ground_truth_metrics_  # Return ground truth response immediately

    return None, None

def is_similar(question1, question2, threshold=0.85):
    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([question1, question2])
        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    except ValueError:  # e.g. one of the questions is empty
        return False
    return similarity >= threshold
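

# Minimal usage sketch (illustrative only): the dataset key, question, response,
# and retrieved documents below are placeholders, not values from this project.
if __name__ == "__main__":
    sample_question = "What is the capital of France?"
    sample_response = "Paris is the capital of France."
    sample_docs = [
        "Paris is the capital and most populous city of France.",
        "France is a country in Western Europe.",
    ]
    # "covidqa" is only an example key; pass whichever dataset name
    # load_query_dataset() expects in your setup.
    results = calculate_metrics(
        question=sample_question,
        q_dataset="covidqa",
        response=sample_response,
        docs=sample_docs,
        time_taken=1.23,
    )
    print(results)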