import pytest from deepeval import assert_test from deepeval.metrics import AnswerRelevancyMetric from deepeval.test_case import LLMTestCase import pandas as pd import os from agent import get_agent from semscore import EmbeddingModelWrapper import logging from tqdm import tqdm from transformers.agents import agent_types def test_case(): answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5) test_case = LLMTestCase( input="What if these shoes don't fit?", # Replace this with the actual output from your LLM application actual_output="We offer a 30-day full refund at no extra costs.", retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."] ) assert_test(test_case, [answer_relevancy_metric]) def test_default_agent(): SAMPLES_DIR = "samples" os.makedirs(SAMPLES_DIR, exist_ok=True) dfSample = pd.read_pickle(os.path.join(SAMPLES_DIR, f"samples.pkl")) agent = get_agent() # Suppress logging from the agent, which can be quite verbose agent.logger.setLevel(logging.CRITICAL) answers_ref = [] answers_pred = [] for title, context, question, answer, synthesized_question in tqdm(dfSample.values): class Output: output: agent_types.AgentType | str = None prompt = synthesized_question answers_ref.append(answer) final_answer = agent.run(prompt, stream=False, reset=True) answers_pred.append(final_answer) answers_ref = [str(answer) for answer in answers_ref] answers_pred = [str(answer) for answer in answers_pred] em = EmbeddingModelWrapper() similarities = em.get_similarities( em.get_embeddings( answers_pred ), em.get_embeddings( answers_ref ), ) mean_similarity = similarities.mean() assert(mean_similarity >= 0.5, f"Mean similarity is too low: {mean_similarity}")