File size: 1,911 Bytes
60d9d3a
 
 
 
e1ed8d0
 
 
 
 
 
 
60d9d3a
 
 
 
 
 
 
 
 
e1ed8d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
import pandas as pd
import os
from agent import get_agent
from semscore import EmbeddingModelWrapper
import logging
from tqdm import tqdm
from transformers.agents import agent_types

def test_case():
    """Run deepeval's answer-relevancy check on one hard-coded example.

    Constructs an AnswerRelevancyMetric with ``threshold=0.5`` and a single
    LLMTestCase built from fixed strings, then passes both to ``assert_test``.
    """
    relevancy = AnswerRelevancyMetric(threshold=0.5)

    # Placeholder answer: substitute the real output of the LLM application.
    llm_answer = "We offer a 30-day full refund at no extra costs."
    refund_policy = "All customers are eligible for a 30 day full refund at no extra costs."

    sample = LLMTestCase(
        input="What if these shoes don't fit?",
        actual_output=llm_answer,
        retrieval_context=[refund_policy],
    )
    assert_test(sample, [relevancy])


def test_default_agent():
    """Check that the default agent's answers are close to the reference answers.

    Loads ``samples/samples.pkl`` into a DataFrame, runs the agent returned by
    ``get_agent()`` on each row's synthesized question, and compares the
    stringified predictions with the stringified reference answers using
    ``EmbeddingModelWrapper``. The test fails when the mean of the returned
    similarities is below 0.5.

    Each row is unpacked as
    ``(title, context, question, answer, synthesized_question)``; only the
    last two fields are used here.
    """
    SAMPLES_DIR = "samples"
    os.makedirs(SAMPLES_DIR, exist_ok=True)
    # NOTE(review): read_pickle can execute arbitrary code while loading; only
    # point this at a trusted, locally generated samples file.
    dfSample = pd.read_pickle(os.path.join(SAMPLES_DIR, "samples.pkl"))
    agent = get_agent()
    # Suppress logging from the agent, which can be quite verbose
    agent.logger.setLevel(logging.CRITICAL)

    answers_ref = []
    answers_pred = []
    for _title, _context, _question, answer, synthesized_question in tqdm(dfSample.values):
        # Stringify both sides so the embedding model always receives text,
        # whatever type the agent or the sample file yields.
        answers_ref.append(str(answer))
        final_answer = agent.run(synthesized_question, stream=False, reset=True)
        answers_pred.append(str(final_answer))

    em = EmbeddingModelWrapper()
    similarities = em.get_similarities(
        em.get_embeddings(answers_pred),
        em.get_embeddings(answers_ref),
    )
    mean_similarity = similarities.mean()

    # No parentheses around the condition and message: `assert (cond, msg)`
    # tests a non-empty tuple, which is always truthy and can never fail.
    assert mean_similarity >= 0.5, f"Mean similarity is too low: {mean_similarity}"