File size: 1,625 Bytes
60d9d3a
 
 
e1ed8d0
f4644e9
e1ed8d0
60d9d3a
f4644e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1ed8d0
f4644e9
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from agent import get_agent
from prompts import FOCUSED_SQUAD_REACT_CODE_SYSTEM_PROMPT
import logging

"""
Test the chatbot's ability to carry out multi-turn conversations, 
adapt to context, and handle a variety of topics.
"""
def test_chatbot_goals():
    """Run a four-turn factual Q&A session against the SQuAD-focused agent and
    score every reply with deepeval's AnswerRelevancyMetric (threshold 0.5).

    The agent's memory is reset only before the opening turn, so later
    questions (e.g. the final "statues nearby" follow-up) depend on context
    accumulated across the conversation.
    """
    questions = [
        "What is on top of the Notre Dame building?",
        "When did the United States purchase Alaska from Russia?",
        "What year did Bern join the Swiss Confederacy?",
        "Are there any other statues nearby the first one you mentioned?",
    ]
    # NOTE(review): these expected fragments are only printed for manual
    # inspection — they are never asserted against the agent's answers.
    # Confirm whether a substring check was intended here.
    expected_fragments = [
        "golden statue of the Virgin Mary",
        "1867",
        "1353",
        "copper statue of Christ",
    ]
    agent = get_agent(system_prompt=FOCUSED_SQUAD_REACT_CODE_SYSTEM_PROMPT)
    # Silence agent-internal logging so pytest output stays readable.
    agent.logger.setLevel(logging.CRITICAL)
    for turn, (question, expected) in enumerate(zip(questions, expected_fragments)):
        metric = AnswerRelevancyMetric(threshold=0.5)
        reset = turn == 0  # wipe agent memory only before the first turn
        print(f"Running with reset={reset}")
        reply = agent.run(question, stream=False, reset=reset)
        print(f"User message: {question}")
        print(f"Minimum acceptable answer: {expected}")
        print(f"Answer: {reply}")
        case = LLMTestCase(
            input=question,
            actual_output=reply,
        )
        assert_test(case, [metric])