# Part 2 of the evaluation: an Ollama-hosted judge model compares each answer
# generated in part 1 against its reference answer, then per-model accuracy is
# summarized to a CSV.
from ollama_mcp_client import MCPClient
from datasets import load_dataset, Dataset
import pandas as pd
import os
import requests
import json
import time

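# Test split of the tool-calling dataset, loaded for reference; it is not used
# again below, since the rows being judged come from the part-1 results file.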
test = load_dataset("jdaddyalbs/playwright-mcp-toolcalling", data_files="data/test.parquet")['train']

eval_models = [
    "hf.co/jdaddyalbs/qwen3_sft_playwright_gguf:latest",
    "hf.co/unsloth/Qwen3-4B-GGUF:Q8_0",
]
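
# File paths: results.jsonl holds the per-sample generations from part 1;
# the CSV and parquet files below are outputs of this script.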
eval_part1_results = 'results.jsonl'
output_file = "eval_results.csv"
llm_results_file = 'eval_llm_results.parquet'
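
# Judge configuration: requests go to a local Ollama server's /api/generate
# endpoint. The judge model must already be available locally (e.g. pulled
# with `ollama pull qwen3:32b`).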
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL_TO_USE = "qwen3:32b"


def compare_with_llm(text1: str, text2: str, query: str, model_name: str = "llama3", max_retries: int = 5, initial_delay: int = 1) -> bool:
    """
    Compares two texts (answer and true_answer) using a locally running Ollama LLM
    to determine if they are semantically equivalent, given a specific query.
    Implements exponential backoff for API call retries.

    Args:
        text1 (str): The 'answer' text to compare.
        text2 (str): The 'true_answer' text to compare.
        query (str): The contextual query to consider during comparison.
        model_name (str): The name of the Ollama model to use (e.g., "llama3", "mistral").
            Ensure this model is pulled and running in Ollama.
        max_retries (int): The maximum number of times to retry the API call.
        initial_delay (int): The initial delay in seconds before the first retry.

    Returns:
        bool: True if the LLM determines the texts are semantically equivalent
            in the context of the query, False otherwise.
    """

    prompt = f"""Given the following query, determine if Text 1 and Text 2 are semantically equivalent.
    Consider the context provided by the query when making your decision.
    Ignore minor differences in punctuation, capitalization, or common phrasing unless they significantly change the meaning.

    Respond ONLY with a JSON object containing a single key 'are_same' with a boolean value (true or false).
    Do NOT include any other text or explanation.

    Query: '{query}'
    Text 1: '{text1}'
    Text 2: '{text2}'
    """
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "format": "json"
    }
    headers = {"Content-Type": "application/json"}

    for attempt in range(max_retries):
        try:
            response = requests.post(OLLAMA_API_URL, headers=headers, data=json.dumps(payload))
            response.raise_for_status()

            result = response.json()

            if result and result.get("response"):
                llm_text_response = result["response"]
                try:
                    parsed_json = json.loads(llm_text_response)
                    return parsed_json.get("are_same", False)
                except json.JSONDecodeError:
                    print(f"Warning: Ollama LLM returned non-JSON response in 'response' field: '{llm_text_response}'.")
                    # Fall back to a crude string check if the JSON is malformed.
                    return "true" in llm_text_response.lower()
            else:
                print(f"Warning: Unexpected Ollama response structure: {result}")
                return False

        except requests.exceptions.ConnectionError:
            print(f"Connection to Ollama server failed. Is Ollama running at {OLLAMA_API_URL}? Please ensure it's active.")
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"Max retries reached. Could not connect to Ollama after {max_retries} attempts.")
                return False
        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"API request failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"API request failed after {max_retries} attempts: {e}")
                return False

    return False
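
# Optional sanity check for the judge (made-up strings; assumes the Ollama
# server is running and the judge model has been pulled):
# compare_with_llm(
#     "The capital of France is Paris.",
#     "Paris",
#     query="What is the capital of France?",
#     model_name=OLLAMA_MODEL_TO_USE,
# )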


def apply_llm_comparison_to_dataset(dataset: Dataset, ollama_model: str = "llama3") -> Dataset:
    """
    Applies the LLM comparison function to the 'answer' and 'true_answer' columns
    of a Hugging Face Dataset, considering a 'query' column, and adds a new
    'llm_match' column using an Ollama model.

    Args:
        dataset (Dataset): The input Hugging Face Dataset with 'answer', 'true_answer',
            and 'query' columns.
        ollama_model (str): The name of the Ollama model to use.

    Returns:
        Dataset: The Dataset with an additional 'llm_match' column.
    """
    print(f"Applying Ollama LLM comparison using model '{ollama_model}' to each example in the dataset...")

    def process_example(example):
        example['llm_match'] = compare_with_llm(
            example['answer'],
            example['true_answer'],
            example['query'],
            model_name=ollama_model
        )
        return example

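    # Dataset.map calls the judge once per row, so this step is network-bound
    # and runs at the speed of the local Ollama server.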
    processed_dataset = dataset.map(process_example)
    print("Ollama LLM comparison applied.")
    return processed_dataset


# Load the part-1 generations and coerce answers to strings so the judge
# prompt always receives text.
df = pd.read_json(eval_part1_results, lines=True)
df['answer'] = df['answer'].apply(lambda x: str(x))
results = Dataset.from_pandas(df)
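
# Judge every row and persist the judged dataset (with the new llm_match
# column) for later inspection.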
llm_results = apply_llm_comparison_to_dataset(results, ollama_model=OLLAMA_MODEL_TO_USE)
llm_results.to_parquet(llm_results_file)
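
# Summarize per model: total judged samples, answers the judge marked correct,
# runs that made at least one tool call, and correct answers that also used tools.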
matched = llm_results.filter(lambda x: x['llm_match'])
used_tools = llm_results.filter(lambda x: x['tool_calls'] > 0)
matched_with_tools = matched.filter(lambda x: x['tool_calls'] > 0)

with open(output_file, "w") as outfile:
    outfile.write("model,eval samples,num correct,num with tools,num correct with tools\n")
    for model in eval_models:
        eval_samples = len(llm_results.filter(lambda x: x['model'] == model))
        num_correct = len(matched.filter(lambda x: x['model'] == model))
        num_w_tools = len(used_tools.filter(lambda x: x['model'] == model))
        num_correct_w_tools = len(matched_with_tools.filter(lambda x: x['model'] == model))
        outfile.write(f"{model},{eval_samples},{num_correct},{num_w_tools},{num_correct_w_tools}\n")