from ollama_mcp_client import MCPClient
from datasets import load_dataset, Dataset
import pandas as pd
import os
import requests
import json
import time

# Held-out test split used for evaluation
test = load_dataset("jdaddyalbs/playwright-mcp-toolcalling", data_files="data/test.parquet")['train']

eval_models = [
    "hf.co/jdaddyalbs/qwen3_sft_playwright_gguf:latest",
    "hf.co/unsloth/Qwen3-4B-GGUF:Q8_0",
]

eval_part1_results = 'results.jsonl'  # this should be generated before you run this script
output_file = "eval_results.csv"
llm_results_file = 'eval_llm_results.parquet'

# We use an Ollama model to grade the answers of each evaluated model
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL_TO_USE = "qwen3:32b"  # You can change this to "mistral", "phi3", etc.

def compare_with_llm(text1: str, text2: str, query: str, model_name: str = "llama3",
                     max_retries: int = 5, initial_delay: int = 1) -> bool:
    """
    Compares two texts (answer and true_answer) using a locally running Ollama LLM
    to determine if they are semantically equivalent, given a specific query.
    Implements exponential backoff for API call retries.

    Args:
        text1 (str): The 'answer' text to compare.
        text2 (str): The 'true_answer' text to compare.
        query (str): The contextual query to consider during comparison.
        model_name (str): The name of the Ollama model to use (e.g., "llama3", "mistral").
                          Ensure this model is pulled and running in Ollama.
        max_retries (int): The maximum number of times to retry the API call.
        initial_delay (int): The initial delay in seconds before the first retry.

    Returns:
        bool: True if the LLM determines the texts are semantically equivalent in the
              context of the query, False otherwise.
    """
    # The prompt includes the query so the comparison is made in context
    prompt = f"""Given the following query, determine if Text 1 and Text 2 are semantically equivalent.
Consider the context provided by the query when making your decision.
Ignore minor differences in punctuation, capitalization, or common phrasing unless they significantly change the meaning.
Respond ONLY with a JSON object containing a single key 'are_same' with a boolean value (true or false).
Do NOT include any other text or explanation.

Query: '{query}'

Text 1: '{text1}'

Text 2: '{text2}'
"""
    # Payload for Ollama's /api/generate endpoint
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "format": "json"
    }
    headers = {"Content-Type": "application/json"}

    for attempt in range(max_retries):
        try:
            response = requests.post(OLLAMA_API_URL, headers=headers, data=json.dumps(payload))
            response.raise_for_status()
            result = response.json()
            if result and result.get("response"):
                llm_text_response = result["response"]
                try:
                    parsed_json = json.loads(llm_text_response)
                    return parsed_json.get("are_same", False)
                except json.JSONDecodeError:
                    print(f"Warning: Ollama LLM returned non-JSON response in 'response' field: '{llm_text_response}'.")
                    # Fall back to a simple keyword check on the raw response
                    return "true" in llm_text_response.lower()
            else:
                print(f"Warning: Unexpected Ollama response structure: {result}")
                return False
        except requests.exceptions.ConnectionError:
            print(f"Connection to Ollama server failed. Is Ollama running at {OLLAMA_API_URL}? Please ensure it's active.")
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"Max retries reached. Could not connect to Ollama after {max_retries} attempts.")
                return False
        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"API request failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"API request failed after {max_retries} attempts: {e}")
                return False
    return False

def apply_llm_comparison_to_dataset(dataset: Dataset, ollama_model: str = "llama3") -> Dataset:
    """
    Applies the LLM comparison function to the 'answer' and 'true_answer' columns
    of a Hugging Face Dataset, considering a 'query' column, and adds a new
    'llm_match' column using an Ollama model.

    Args:
        dataset (Dataset): The input Hugging Face Dataset with 'answer', 'true_answer',
                           and 'query' columns.
        ollama_model (str): The name of the Ollama model to use.

    Returns:
        Dataset: The Dataset with an additional 'llm_match' column.
    """
    print(f"Applying Ollama LLM comparison using model '{ollama_model}' to each example in the dataset...")

    def process_example(example):
        example['llm_match'] = compare_with_llm(
            example['answer'],
            example['true_answer'],
            example['query'],  # Pass the query to the comparison function
            model_name=ollama_model
        )
        return example

    processed_dataset = dataset.map(process_example)
    print("Ollama LLM comparison applied.")
    return processed_dataset

# Load the raw evaluation results produced in part 1
df = pd.read_json(eval_part1_results, lines=True)
df['answer'] = df['answer'].apply(lambda x: str(x))
results = Dataset.from_pandas(df)

# Apply the LLM comparison function for grading the answers.
# If you have already generated llm_results_file, comment out the next two lines
# and load it instead:
# llm_results = Dataset.from_parquet(llm_results_file)
llm_results = apply_llm_comparison_to_dataset(results, ollama_model=OLLAMA_MODEL_TO_USE)
llm_results.to_parquet(llm_results_file)

# Get the samples which were graded as correct answers
matched = llm_results.filter(lambda x: x['llm_match'])
used_tools = llm_results.filter(lambda x: x['tool_calls'] > 0)
matched_with_tools = matched.filter(lambda x: x['tool_calls'] > 0)

# Write a per-model summary of the evaluation as CSV
with open(output_file, "w") as outfile:
    outfile.write("model,eval samples,num correct,num with tools,num correct with tools\n")
    for model in eval_models:
        eval_samples = len(llm_results.filter(lambda x: x['model'] == model))
        num_correct = len(matched.filter(lambda x: x['model'] == model))
        num_w_tools = len(used_tools.filter(lambda x: x['model'] == model))
        num_correct_w_tools = len(matched_with_tools.filter(lambda x: x['model'] == model))
        outfile.write(f"{model},{eval_samples},{num_correct},{num_w_tools},{num_correct_w_tools}\n")