import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from data_processing import load_ragbench

# Module-level ground truth, populated by retrieve_ground_truths() when a
# matching RAGBench instance is found.
ground_truth_answer = ''
ground_truth_metrics = {}


def calculate_metrics(question, response, docs, time_taken):
    data = load_ragbench()
    retrieve_ground_truths(question, data)
    # Metrics predicted from the live question, response, and retrieved docs
    predicted_metrics = {
        "context_relevance": context_relevance(question, docs),
        "context_utilization": context_utilization(response, docs),
        "completeness": completeness(response, ground_truth_answer),
        "adherence": adherence(response, docs),
        "response_time": time_taken
    }
    return predicted_metrics

def retrieve_ground_truths(question, ragbench_set):
    # Write results into the module-level globals read by calculate_metrics()
    global ground_truth_answer, ground_truth_metrics
    # Iterate through every dataset and its splits (train, test, validation)
    for dataset_name in ragbench_set.keys():
        for split_name, instances in ragbench_set[dataset_name].items():
            print(f"Processing {split_name} split")
            for instance in instances:
                # Check whether this instance's question matches the query
                if instance['question'] == question:
                    # Match found: keep its id, response, and ground-truth scores
                    instance_id = instance['id']
                    instance_response = instance['response']
                    ground_truth_metrics = {
                        "context_relevance": instance['relevance_score'],
                        "context_utilization": instance['utilization_score'],
                        "completeness": instance['completeness_score'],
                        "adherence": instance['adherence_score']
                    }
                    ground_truth_answer = instance_response
                    print(f"Match found in {split_name} split!")
                    print(f"ID: {instance_id}, Response: {instance_response}")
                    return  # Stop the whole search after the first match

# Step 1: Helper function to compute cosine similarity
def compute_cosine_similarity(text1, text2):
    # Blank text has zero similarity; returning early also avoids
    # TfidfVectorizer's empty-vocabulary error.
    if not text1.strip() or not text2.strip():
        return 0.0
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

# Step 2: Metric 1 - Context Relevance
def context_relevance(question, relevant_documents):
    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
    return compute_cosine_similarity(question, combined_docs)

# Step 3: Metric 2 - Context Utilization
def context_utilization(response, relevant_documents):
    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
    return compute_cosine_similarity(response, combined_docs)

# Step 4: Metric 3 - Completeness
def completeness(response, ground_truth_answer):
    return compute_cosine_similarity(response, ground_truth_answer)

# Step 5: Metric 4 - Adherence
def adherence(response, relevant_documents):
    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
    response_tokens = set(response.split())
    if not response_tokens:
        return 0.0  # An empty response has no tokens to support
    relevant_tokens = set(combined_docs.split())
    supported_tokens = response_tokens.intersection(relevant_tokens)
    return len(supported_tokens) / len(response_tokens)

# Step 6: Compute RMSE for metrics
def compute_rmse(predicted_values, ground_truth_values):
    return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
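
# Illustrative usage sketch (assumed entry point, not part of the original
# pipeline): the question, response, docs, and the SimpleDoc wrapper below are
# hypothetical stand-ins for whatever the retrieval pipeline actually produces.
if __name__ == "__main__":
    from dataclasses import dataclass

    @dataclass
    class SimpleDoc:
        # Mimics the document objects used above, which expose .page_content
        page_content: str

    question = "What is the capital of France?"            # hypothetical query
    response = "Paris is the capital of France."           # hypothetical model answer
    docs = [SimpleDoc("Paris is the capital and largest city of France.")]

    predicted = calculate_metrics(question, response, docs, time_taken=0.42)
    print("Predicted metrics:", predicted)

    # If retrieve_ground_truths() found a matching RAGBench instance, compare
    # the predicted scores against the dataset's ground truth with RMSE.
    shared_keys = [k for k in ground_truth_metrics if k in predicted]
    if shared_keys:
        rmse = compute_rmse(
            [predicted[k] for k in shared_keys],
            [ground_truth_metrics[k] for k in shared_keys],
        )
        print("RMSE vs ground truth:", rmse)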