from ollama_mcp_client import MCPClient
from datasets import load_dataset, Dataset
import pandas as pd
import os
import requests
import json
import time
test = load_dataset("jdaddyalbs/playwright-mcp-toolcalling", data_files="data/test.parquet")['train']  # benchmark test split (not used directly below)
eval_models = [
    "hf.co/jdaddyalbs/qwen3_sft_playwright_gguf:latest",
    "hf.co/unsloth/Qwen3-4B-GGUF:Q8_0",
]
eval_part1_results = 'results.jsonl' # this should be generated before you run this script
output_file = "eval_results.csv"
llm_results_file = 'eval_llm_results.parquet'
# We use an ollama model to grade the answers of each evaluated model
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL_TO_USE = "qwen3:32b" # You can change this to "mistral", "phi3", etc.
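# Optional pre-flight check (illustrative, commented out): grading assumes a local
# Ollama server with the judge model already pulled. Ollama's /api/tags endpoint
# lists locally available models, so a check like this can catch a missing model
# before any grading calls are made:
#   tags = requests.get("http://localhost:11434/api/tags").json()
#   print([m["name"] for m in tags.get("models", [])])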
def compare_with_llm(text1: str, text2: str, query: str, model_name: str = "llama3", max_retries: int = 5, initial_delay: int = 1) -> bool:
"""
Compares two texts (answer and true_answer) using a locally running Ollama LLM
to determine if they are semantically equivalent, given a specific query.
Implements exponential backoff for API call retries.
Args:
text1 (str): The 'answer' text to compare.
text2 (str): The 'true_answer' text to compare.
query (str): The contextual query to consider during comparison.
model_name (str): The name of the Ollama model to use (e.g., "llama3", "mistral").
Ensure this model is pulled and running in Ollama.
max_retries (int): The maximum number of times to retry the API call.
initial_delay (int): The initial delay in seconds before the first retry.
Returns:
bool: True if the LLM determines the texts are semantically equivalent
in the context of the query, False otherwise.
"""
# The prompt now includes the query for contextual comparison
prompt = f"""Given the following query, determine if Text 1 and Text 2 are semantically equivalent.
Consider the context provided by the query when making your decision.
Ignore minor differences in punctuation, capitalization, or common phrasing unless they significantly change the meaning.
Respond ONLY with a JSON object containing a single key 'are_same' with a boolean value (true or false).
Do NOT include any other text or explanation.
Query: '{query}'
Text 1: '{text1}'
Text 2: '{text2}'
"""
# Ollama's generate endpoint payload
payload = {
"model": model_name,
"prompt": prompt,
"stream": False,
"format": "json"
}
headers = {"Content-Type": "application/json"}
for attempt in range(max_retries):
try:
response = requests.post(OLLAMA_API_URL, headers=headers, data=json.dumps(payload))
response.raise_for_status()
result = response.json()
if result and result.get("response"):
llm_text_response = result["response"]
try:
parsed_json = json.loads(llm_text_response)
return parsed_json.get("are_same", False)
except json.JSONDecodeError:
print(f"Warning: Ollama LLM returned non-JSON response in 'response' field: '{llm_text_response}'.")
return "true" in llm_text_response.lower()
else:
print(f"Warning: Unexpected Ollama response structure: {result}")
return False
except requests.exceptions.ConnectionError:
print(f"Connection to Ollama server failed. Is Ollama running at {OLLAMA_API_URL}? Please ensure it's active.")
if attempt < max_retries - 1:
delay = initial_delay * (2 ** attempt)
print(f"Retrying in {delay:.2f} seconds...")
time.sleep(delay)
else:
print(f"Max retries reached. Could not connect to Ollama after {max_retries} attempts.")
return False
except requests.exceptions.RequestException as e:
if attempt < max_retries - 1:
delay = initial_delay * (2 ** attempt)
print(f"API request failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.2f} seconds...")
time.sleep(delay)
else:
print(f"API request failed after {max_retries} attempts: {e}")
return False
return False
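# Illustrative one-off use of compare_with_llm (commented out; the strings are made-up
# examples, not benchmark data):
#   is_same = compare_with_llm(
#       "The page title is 'Example Domain'.",
#       "Example Domain",
#       "What is the title of the page?",
#       model_name=OLLAMA_MODEL_TO_USE,
#   )
#   print(is_same)  # True when the judge model deems the answers equivalent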
def apply_llm_comparison_to_dataset(dataset: Dataset, ollama_model: str = "llama3") -> Dataset:
"""
Applies the LLM comparison function to the 'answer' and 'true_answer' columns
of a Hugging Face Dataset, considering a 'query' column, and adds a new 'llm_match' column
using an Ollama model.
Args:
dataset (Dataset): The input Hugging Face Dataset with 'answer', 'true_answer',
and 'query' columns.
ollama_model (str): The name of the Ollama model to use.
Returns:
Dataset: The Dataset with an additional 'llm_match' column.
"""
print(f"Applying Ollama LLM comparison using model '{ollama_model}' to each example in the dataset...")
def process_example(example):
example['llm_match'] = compare_with_llm(
example['answer'],
example['true_answer'],
example['query'], # Pass the query to the comparison function
model_name=ollama_model
)
return example
processed_dataset = dataset.map(process_example)
print("Ollama LLM comparison applied.")
return processed_dataset
df = pd.read_json(eval_part1_results, lines=True)
df['answer'] = df['answer'].apply(str)  # answers may not all be strings; cast them for the judge prompt
results = Dataset.from_pandas(df)
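# Optional smoke test (commented out): grade only the first two rows to confirm the
# judge model returns the expected JSON before grading the full results file:
#   preview = apply_llm_comparison_to_dataset(results.select(range(2)), ollama_model=OLLAMA_MODEL_TO_USE)
#   print(preview['llm_match'])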
# Grade the answers with the LLM judge.
# If llm_results_file was already generated on a previous run, uncomment the next
# line and comment out the two lines after it to skip re-grading.
#llm_results = Dataset.from_parquet(llm_results_file)
llm_results = apply_llm_comparison_to_dataset(results, ollama_model=OLLAMA_MODEL_TO_USE)
llm_results.to_parquet(llm_results_file)
# Samples the LLM judge graded as correct
matched = llm_results.filter(lambda x: x['llm_match'])
used_tools = llm_results.filter(lambda x: x['tool_calls'] > 0)
matched_with_tools = matched.filter(lambda x: x['tool_calls'] > 0)
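# Quick overall tallies across all models (commented out, optional):
#   print(len(llm_results), len(matched), len(used_tools), len(matched_with_tools))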
with open(output_file,"w") as outfile:
outfile.write("model,eval samples,num correct,num with tools,num correct with tools\n")
for model in eval_models:
eval_samples = len(llm_results.filter(lambda x: x['model']==model))
num_correct = len(matched.filter(lambda x: x['model']==model))
num_w_tools = len(used_tools.filter(lambda x: x['model']==model))
num_correct_w_tools = len(matched_with_tools.filter(lambda x: x['model']==model))
outfile.write(f"{model},{eval_samples},{num_correct},{num_w_tools},{num_correct_w_tools}\n")
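# Quick sanity check of the written summary (commented out, optional):
#   print(pd.read_csv(output_file).to_string(index=False))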