# Part 2 of the evaluation: an Ollama-hosted judge model compares each answer
# generated in part 1 against its reference answer, then per-model accuracy is
# summarized to a CSV.
from ollama_mcp_client import MCPClient
from datasets import load_dataset, Dataset
import pandas as pd
import os
import requests
import json
import time

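# Test split of the tool-calling dataset, loaded for reference; it is not used
# again below, since the rows being judged come from the part-1 results file.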
test = load_dataset("jdaddyalbs/playwright-mcp-toolcalling", data_files="data/test.parquet")['train']

eval_models = [
    "hf.co/jdaddyalbs/qwen3_sft_playwright_gguf:latest",
    "hf.co/unsloth/Qwen3-4B-GGUF:Q8_0",
]
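
# File paths: results.jsonl holds the per-sample generations from part 1;
# the CSV and parquet files below are outputs of this script.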
eval_part1_results = 'results.jsonl'
output_file = "eval_results.csv"
llm_results_file = 'eval_llm_results.parquet'
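
# Judge configuration: requests go to a local Ollama server's /api/generate
# endpoint. The judge model must already be available locally (e.g. pulled
# with `ollama pull qwen3:32b`).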
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL_TO_USE = "qwen3:32b"


def compare_with_llm(text1: str, text2: str, query: str, model_name: str = "llama3", max_retries: int = 5, initial_delay: int = 1) -> bool:
    """
    Compares two texts (answer and true_answer) using a locally running Ollama LLM
    to determine if they are semantically equivalent, given a specific query.
    Implements exponential backoff for API call retries.

    Args:
        text1 (str): The 'answer' text to compare.
        text2 (str): The 'true_answer' text to compare.
        query (str): The contextual query to consider during comparison.
        model_name (str): The name of the Ollama model to use (e.g., "llama3", "mistral").
            Ensure this model is pulled and running in Ollama.
        max_retries (int): The maximum number of times to retry the API call.
        initial_delay (int): The initial delay in seconds before the first retry.

    Returns:
        bool: True if the LLM determines the texts are semantically equivalent
            in the context of the query, False otherwise.
    """

    prompt = f"""Given the following query, determine if Text 1 and Text 2 are semantically equivalent.
    Consider the context provided by the query when making your decision.
    Ignore minor differences in punctuation, capitalization, or common phrasing unless they significantly change the meaning.

    Respond ONLY with a JSON object containing a single key 'are_same' with a boolean value (true or false).
    Do NOT include any other text or explanation.

    Query: '{query}'
    Text 1: '{text1}'
    Text 2: '{text2}'
    """
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "format": "json"
    }
    headers = {"Content-Type": "application/json"}

    for attempt in range(max_retries):
        try:
            response = requests.post(OLLAMA_API_URL, headers=headers, data=json.dumps(payload))
            response.raise_for_status()

            result = response.json()

            if result and result.get("response"):
                llm_text_response = result["response"]
                try:
                    parsed_json = json.loads(llm_text_response)
                    return parsed_json.get("are_same", False)
                except json.JSONDecodeError:
                    print(f"Warning: Ollama LLM returned non-JSON response in 'response' field: '{llm_text_response}'.")
                    # Fall back to a crude string check if the JSON is malformed.
                    return "true" in llm_text_response.lower()
            else:
                print(f"Warning: Unexpected Ollama response structure: {result}")
                return False

        except requests.exceptions.ConnectionError:
            print(f"Connection to Ollama server failed. Is Ollama running at {OLLAMA_API_URL}? Please ensure it's active.")
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"Max retries reached. Could not connect to Ollama after {max_retries} attempts.")
                return False
        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"API request failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"API request failed after {max_retries} attempts: {e}")
                return False

    return False
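
# Optional sanity check for the judge (made-up strings; assumes the Ollama
# server is running and the judge model has been pulled):
# compare_with_llm(
#     "The capital of France is Paris.",
#     "Paris",
#     query="What is the capital of France?",
#     model_name=OLLAMA_MODEL_TO_USE,
# )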


def apply_llm_comparison_to_dataset(dataset: Dataset, ollama_model: str = "llama3") -> Dataset:
    """
    Applies the LLM comparison function to the 'answer' and 'true_answer' columns
    of a Hugging Face Dataset, considering a 'query' column, and adds a new
    'llm_match' column using an Ollama model.

    Args:
        dataset (Dataset): The input Hugging Face Dataset with 'answer', 'true_answer',
            and 'query' columns.
        ollama_model (str): The name of the Ollama model to use.

    Returns:
        Dataset: The Dataset with an additional 'llm_match' column.
    """
    print(f"Applying Ollama LLM comparison using model '{ollama_model}' to each example in the dataset...")

    def process_example(example):
        example['llm_match'] = compare_with_llm(
            example['answer'],
            example['true_answer'],
            example['query'],
            model_name=ollama_model
        )
        return example

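    # Dataset.map calls the judge once per row, so this step is network-bound
    # and runs at the speed of the local Ollama server.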
    processed_dataset = dataset.map(process_example)
    print("Ollama LLM comparison applied.")
    return processed_dataset


# Load the part-1 generations and coerce answers to strings so the judge
# prompt always receives text.
df = pd.read_json(eval_part1_results, lines=True)
df['answer'] = df['answer'].apply(lambda x: str(x))
results = Dataset.from_pandas(df)
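
# Judge every row and persist the judged dataset (with the new llm_match
# column) for later inspection.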
llm_results = apply_llm_comparison_to_dataset(results, ollama_model=OLLAMA_MODEL_TO_USE)
llm_results.to_parquet(llm_results_file)
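
# Summarize per model: total judged samples, answers the judge marked correct,
# runs that made at least one tool call, and correct answers that also used tools.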
matched = llm_results.filter(lambda x: x['llm_match'])
used_tools = llm_results.filter(lambda x: x['tool_calls'] > 0)
matched_with_tools = matched.filter(lambda x: x['tool_calls'] > 0)

with open(output_file, "w") as outfile:
    outfile.write("model,eval samples,num correct,num with tools,num correct with tools\n")
    for model in eval_models:
        eval_samples = len(llm_results.filter(lambda x: x['model'] == model))
        num_correct = len(matched.filter(lambda x: x['model'] == model))
        num_w_tools = len(used_tools.filter(lambda x: x['model'] == model))
        num_correct_w_tools = len(matched_with_tools.filter(lambda x: x['model'] == model))
        outfile.write(f"{model},{eval_samples},{num_correct},{num_w_tools},{num_correct_w_tools}\n")