from ollama_mcp_client import MCPClient
from datasets import load_dataset, Dataset
import pandas as pd
import os
import requests
import json
import time

test = load_dataset("jdaddyalbs/playwright-mcp-toolcalling", data_files="data/test.parquet")['train']  # test split (loaded here but not used directly by the grading below)

eval_models = [
    "hf.co/jdaddyalbs/qwen3_sft_playwright_gguf:latest",
    "hf.co/unsloth/Qwen3-4B-GGUF:Q8_0",
]
eval_part1_results = 'results.jsonl' # this should be generated before you run this script
output_file = "eval_results.csv"
llm_results_file = 'eval_llm_results.parquet'
# We use an ollama model to grade the answers of each evaluated model
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL_TO_USE = "qwen3:32b" # You can change this to "mistral", "phi3", etc.
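
# Optional preflight check (a hedged sketch added for illustration; the helper
# name check_ollama_ready is not part of the original script). Ollama's
# /api/tags endpoint lists locally pulled models, so we can confirm the server
# is reachable and the judge model is available before grading a full run.
# Call it manually before apply_llm_comparison_to_dataset() if desired.
def check_ollama_ready(api_url: str = OLLAMA_API_URL, model: str = OLLAMA_MODEL_TO_USE) -> bool:
    """Return True if the Ollama server responds and the judge model is pulled."""
    tags_url = api_url.replace("/api/generate", "/api/tags")
    try:
        resp = requests.get(tags_url, timeout=5)
        resp.raise_for_status()
        names = [m.get("name", "") for m in resp.json().get("models", [])]
        if not any(model in name for name in names):
            print(f"Warning: judge model '{model}' not found among local models: {names}")
            return False
        return True
    except requests.exceptions.RequestException as e:
        print(f"Warning: could not reach Ollama at {tags_url}: {e}")
        return False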

def compare_with_llm(text1: str, text2: str, query: str, model_name: str = "llama3", max_retries: int = 5, initial_delay: int = 1) -> bool:
    """
    Compares two texts (answer and true_answer) using a locally running Ollama LLM
    to determine if they are semantically equivalent, given a specific query.
    Implements exponential backoff for API call retries.

    Args:
        text1 (str): The 'answer' text to compare.
        text2 (str): The 'true_answer' text to compare.
        query (str): The contextual query to consider during comparison.
        model_name (str): The name of the Ollama model to use (e.g., "llama3", "mistral").
                          Ensure this model is pulled and running in Ollama.
        max_retries (int): The maximum number of times to retry the API call.
        initial_delay (int): The initial delay in seconds before the first retry.

    Returns:
        bool: True if the LLM determines the texts are semantically equivalent
              in the context of the query, False otherwise.
    """
    # The prompt now includes the query for contextual comparison
    prompt = f"""Given the following query, determine if Text 1 and Text 2 are semantically equivalent.
    Consider the context provided by the query when making your decision.
    Ignore minor differences in punctuation, capitalization, or common phrasing unless they significantly change the meaning.

    Respond ONLY with a JSON object containing a single key 'are_same' with a boolean value (true or false).
    Do NOT include any other text or explanation.

    Query: '{query}'
    Text 1: '{text1}'
    Text 2: '{text2}'
    """

    # Ollama's generate endpoint payload
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "format": "json"
    }
    headers = {"Content-Type": "application/json"}

    for attempt in range(max_retries):
        try:
            response = requests.post(OLLAMA_API_URL, headers=headers, data=json.dumps(payload))
            response.raise_for_status()

            result = response.json()

            if result and result.get("response"):
                llm_text_response = result["response"]
                try:
                    parsed_json = json.loads(llm_text_response)
                    return parsed_json.get("are_same", False)
                except json.JSONDecodeError:
                    print(f"Warning: Ollama LLM returned non-JSON response in 'response' field: '{llm_text_response}'.")
                    return "true" in llm_text_response.lower()

            else:
                print(f"Warning: Unexpected Ollama response structure: {result}")
                return False

        except requests.exceptions.ConnectionError:
            print(f"Connection to Ollama server failed. Is Ollama running at {OLLAMA_API_URL}? Please ensure it's active.")
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"Max retries reached. Could not connect to Ollama after {max_retries} attempts.")
                return False
        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"API request failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"API request failed after {max_retries} attempts: {e}")
                return False

    return False
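
# Minimal smoke test for compare_with_llm (a hedged sketch; the texts, query,
# and the SMOKE_TEST_JUDGE environment variable are invented for illustration).
# Set SMOKE_TEST_JUDGE=1 to check that the judge model answers sensibly before
# grading the full results file.
if os.environ.get("SMOKE_TEST_JUDGE"):
    verdict = compare_with_llm(
        text1="The page title is 'Example Domain'.",
        text2="Title of the page: Example Domain",
        query="What is the title of the page?",
        model_name=OLLAMA_MODEL_TO_USE,
    )
    print(f"Smoke test verdict (expected True): {verdict}")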

def apply_llm_comparison_to_dataset(dataset: Dataset, ollama_model: str = "llama3") -> Dataset:
    """
    Applies the LLM comparison function to the 'answer' and 'true_answer' columns
    of a Hugging Face Dataset, considering a 'query' column, and adds a new 'llm_match' column
    using an Ollama model.

    Args:
        dataset (Dataset): The input Hugging Face Dataset with 'answer', 'true_answer',
                           and 'query' columns.
        ollama_model (str): The name of the Ollama model to use.

    Returns:
        Dataset: The Dataset with an additional 'llm_match' column.
    """
    print(f"Applying Ollama LLM comparison using model '{ollama_model}' to each example in the dataset...")

    def process_example(example):
        example['llm_match'] = compare_with_llm(
            example['answer'],
            example['true_answer'],
            example['query'], # Pass the query to the comparison function
            model_name=ollama_model
        )
        return example

    processed_dataset = dataset.map(process_example)
    print("Ollama LLM comparison applied.")
    return processed_dataset


df = pd.read_json(eval_part1_results, lines=True)
df['answer'] = df['answer'].astype(str)  # ensure answers are plain strings before prompting the judge
results = Dataset.from_pandas(df)

# Apply the LLM comparison to grade the answers.
# If you have already generated the parquet file below, uncomment the next line
# and comment out the two lines after it to reuse the cached results instead.
#llm_results = Dataset.from_parquet(llm_results_file)
llm_results = apply_llm_comparison_to_dataset(results, ollama_model=OLLAMA_MODEL_TO_USE)
llm_results.to_parquet(llm_results_file)

# select the samples graded as correct, the samples that used tools, and their intersection
matched = llm_results.filter(lambda x: x['llm_match'])
used_tools = llm_results.filter(lambda x: x['tool_calls'] > 0)
matched_with_tools = matched.filter(lambda x: x['tool_calls'] > 0)
with open(output_file,"w") as outfile:
    outfile.write("model,eval samples,num correct,num with tools,num correct with tools\n")
    for model in eval_models:
        eval_samples = len(llm_results.filter(lambda x: x['model']==model))
        num_correct = len(matched.filter(lambda x: x['model']==model))
        num_w_tools = len(used_tools.filter(lambda x: x['model']==model))
        num_correct_w_tools = len(matched_with_tools.filter(lambda x: x['model']==model))
        outfile.write(f"{model},{eval_samples},{num_correct},{num_w_tools},{num_correct_w_tools}\n")