import logging

from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

from agent import get_agent
from prompts import FOCUSED_SQUAD_REACT_CODE_SYSTEM_PROMPT


def test_chatbot_goals():
    """
    Test the chatbot's ability to carry out multi-turn conversations,
    adapt to context, and handle a variety of topics.
    """
    user_messages = [
        "What is on top of the Notre Dame building?",
        "When did the United States purchase Alaska from Russia?",
        "What year did Bern join the Swiss Confederacy?",
        "Are there any other statues near the first one you mentioned?",
    ]
    # Reference answers, printed for manual inspection; the automated check
    # below scores answer relevancy only, not factual correctness.
    minimum_acceptable_answers = [
        "golden statue of the Virgin Mary",
        "1867",
        "1353",
        "copper statue of Christ",
    ]

    agent = get_agent(system_prompt=FOCUSED_SQUAD_REACT_CODE_SYSTEM_PROMPT)
    # Silence the agent's own logging so the test output stays readable.
    agent.logger.setLevel(logging.CRITICAL)

    for i, (user_message, minimum_acceptable_answer) in enumerate(
        zip(user_messages, minimum_acceptable_answers)
    ):
        answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)

        # Clear the agent's memory only on the first turn so that later
        # questions can refer back to earlier answers (e.g. "the first one
        # you mentioned").
        reset = i == 0
        print(f"Running with reset={reset}")

        answer = agent.run(user_message, stream=False, reset=reset)
        print(f"User message: {user_message}")
        print(f"Minimum acceptable answer: {minimum_acceptable_answer}")
        print(f"Answer: {answer}")

        # LLMTestCase expects a string, while agent.run() may return other
        # types, so coerce the answer explicitly.
        test_case = LLMTestCase(
            input=user_message,
            actual_output=str(answer),
        )
        assert_test(test_case, [answer_relevancy_metric])
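
# A stricter variant (a sketch, not part of the test above): answer relevancy
# alone does not verify factual correctness, so one could also pass the
# reference answer as expected_output and score it with deepeval's GEval
# metric. Inside the loop, that would look roughly like:
#
#     from deepeval.metrics import GEval
#     from deepeval.test_case import LLMTestCaseParams
#
#     correctness_metric = GEval(
#         name="Correctness",
#         criteria="Check whether the actual output contains the facts "
#                  "stated in the expected output.",
#         evaluation_params=[
#             LLMTestCaseParams.ACTUAL_OUTPUT,
#             LLMTestCaseParams.EXPECTED_OUTPUT,
#         ],
#         threshold=0.5,
#     )
#     test_case = LLMTestCase(
#         input=user_message,
#         actual_output=str(answer),
#         expected_output=minimum_acceptable_answer,
#     )
#     assert_test(test_case, [answer_relevancy_metric, correctness_metric])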