|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Script d'évaluation pour le modèle n8n Expert. |
|
|
|
|
|
Métriques: |
|
|
1. JSON Validity - La sortie est-elle du JSON valide ? |
|
|
2. Schema Compliance - Le workflow suit-il le schéma n8n? |
|
|
3. Node Accuracy - Les types de nodes sont-ils corrects? |
|
|
4. Connection Logic - Les connexions sont-elles cohérentes? |
|
|
5. Thinking Quality - Le raisonnement est-il présent et structuré? |
|
|
|
|
|
Usage: |
|
|
python eval_n8n_model.py --model stmasson/n8n-expert-14b --samples 100 |
|
|
""" |
|
|
|
|
|
import os |
|
|
import json |
|
|
import argparse |
|
|
import re |
|
|
from typing import Dict, List, Any, Tuple |
|
|
from dataclasses import dataclass |
|
|
from tqdm import tqdm |
|
|
import pandas as pd |
|
|
import torch |
|
|
from datasets import load_dataset |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
from huggingface_hub import login |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Known n8n node type identifiers, grouped for readability. The grouping is
# purely cosmetic: everything is merged into one flat set below.

# Nodes whose identifier ends in "Trigger".
_TRIGGER_NODE_TYPES = (
    "n8n-nodes-base.webhookTrigger",
    "n8n-nodes-base.scheduleTrigger",
    "n8n-nodes-base.manualTrigger",
    "n8n-nodes-base.emailTrigger",
)

# General-purpose nodes (HTTP, data shaping, branching, scripting).
_CORE_NODE_TYPES = (
    "n8n-nodes-base.httpRequest",
    "n8n-nodes-base.set",
    "n8n-nodes-base.if",
    "n8n-nodes-base.switch",
    "n8n-nodes-base.merge",
    "n8n-nodes-base.splitInBatches",
    "n8n-nodes-base.function",
    "n8n-nodes-base.code",
    "n8n-nodes-base.noOp",
)

# Third-party services and databases.
_INTEGRATION_NODE_TYPES = (
    "n8n-nodes-base.slack",
    "n8n-nodes-base.gmail",
    "n8n-nodes-base.googleSheets",
    "n8n-nodes-base.airtable",
    "n8n-nodes-base.notion",
    "n8n-nodes-base.discord",
    "n8n-nodes-base.telegram",
    "n8n-nodes-base.openAi",
    "n8n-nodes-base.postgres",
    "n8n-nodes-base.mysql",
    "n8n-nodes-base.mongodb",
)

# Nodes from the "@n8n/n8n-nodes-langchain" package.
_LANGCHAIN_NODE_TYPES = (
    "@n8n/n8n-nodes-langchain.agent",
    "@n8n/n8n-nodes-langchain.chainLlm",
)

VALID_NODE_TYPES = set(
    _TRIGGER_NODE_TYPES
    + _CORE_NODE_TYPES
    + _INTEGRATION_NODE_TYPES
    + _LANGCHAIN_NODE_TYPES
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Names of the boolean checks that contribute, with equal weight, to
# EvalResult.score.
_SCORED_CHECKS = (
    "valid_json",
    "has_nodes",
    "has_connections",
    "nodes_valid",
    "has_thinking",
    "thinking_structured",
)


@dataclass
class EvalResult:
    """Outcome of evaluating one model response."""

    task_type: str  # task label assigned to the example
    valid_json: bool  # the extracted workflow text parsed as JSON
    has_nodes: bool  # the workflow contains at least one node
    has_connections: bool  # the workflow contains at least one connection entry
    nodes_valid: bool  # no node type was rejected
    has_thinking: bool  # a sufficiently long reasoning section was found
    thinking_structured: bool  # the reasoning shows list/step markers
    error: str = ""  # parse error message, empty when there was none

    @property
    def score(self) -> float:
        """Return the fraction (0-1) of the six boolean checks that passed."""
        outcomes = tuple(getattr(self, name) for name in _SCORED_CHECKS)
        return sum(outcomes) / len(outcomes)
|
|
|
|
|
|
|
|
def extract_workflow_json(text: str) -> Tuple[str, str]:
    """Split a model response into its reasoning and its workflow JSON text.

    The reasoning is the content of the first ``<thinking>...</thinking>``
    pair. The workflow text is looked up in this order:

    1. a fenced json code block located after the thinking section;
    2. a fenced json code block anywhere in the response;
    3. the first JSON value that decodes cleanly starting at the first ``{``
       after the thinking section, so prose that follows the object (even
       prose containing braces) is not swallowed;
    4. the span from that first ``{`` to the last ``}``, returned as-is so
       the caller can report the parse error.

    Args:
        text: Raw text generated by the model.

    Returns:
        A ``(thinking, workflow_json)`` tuple of strings. Either element is
        an empty string when the corresponding part was not found. The
        workflow text is not guaranteed to be valid JSON.
    """
    thinking = ""
    answer = text

    thinking_match = re.search(r'<thinking>(.*?)</thinking>', text, re.DOTALL)
    if thinking_match:
        thinking = thinking_match.group(1).strip()
        answer = text[thinking_match.end():]

    # Prefer a fence in the answer part: a fenced snippet quoted inside the
    # reasoning must not shadow the actual workflow.
    fence_pattern = r'```json\s*(.*?)\s*```'
    json_block = (
        re.search(fence_pattern, answer, re.DOTALL)
        or re.search(fence_pattern, text, re.DOTALL)
    )
    if json_block:
        return thinking, json_block.group(1).strip()

    start = answer.find("{")
    if start == -1:
        return thinking, ""

    candidate = answer[start:]
    try:
        # raw_decode stops at the end of the first complete JSON value and
        # reports where it ended, ignoring whatever text follows.
        _, end = json.JSONDecoder().raw_decode(candidate)
    except (ValueError, RecursionError):
        # Not decodable from the first brace: keep the widest brace-delimited
        # span so downstream validation can surface the decoding error.
        last = candidate.rfind("}")
        if last == -1:
            return thinking, ""
        return thinking, candidate[:last + 1]

    return thinking, candidate[:end]
|
|
|
|
|
|
|
|
def validate_workflow(workflow_json: str) -> Dict[str, Any]:
    """Run structural checks on a workflow given as JSON text.

    The function never raises on malformed model output: unexpected JSON
    shapes are reported through the returned flags instead.

    Args:
        workflow_json: Text expected to contain a JSON object with optional
            ``"nodes"`` (list) and ``"connections"`` (object) entries.

    Returns:
        A dict with the keys:

        * ``valid_json``: the text decoded as JSON;
        * ``has_nodes`` / ``node_count``: presence and number of nodes;
        * ``has_connections`` / ``connection_count``: presence of connection
          entries and the summed length of their values;
        * ``invalid_nodes``: rejected node types (or the string form of
          entries that are not objects / types that are not strings);
        * ``nodes_valid``: ``True`` when ``invalid_nodes`` is empty;
        * ``error``: only present when decoding failed or the top-level JSON
          value is not an object.
    """
    result: Dict[str, Any] = {
        "valid_json": False,
        "has_nodes": False,
        "has_connections": False,
        "nodes_valid": False,
        "node_count": 0,
        "connection_count": 0,
        "invalid_nodes": [],
    }

    try:
        wf = json.loads(workflow_json)
    except (ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers a
        # non-string input such as None.
        result["error"] = str(e)
        return result
    result["valid_json"] = True

    # Valid JSON is not necessarily an object (e.g. "[]" or "42"); without
    # this guard the .get() calls below would raise AttributeError.
    if not isinstance(wf, dict):
        result["error"] = (
            f"Top-level JSON value is {type(wf).__name__}, expected an object"
        )
        return result

    nodes = wf.get("nodes", [])
    if not isinstance(nodes, list):
        nodes = []
    result["has_nodes"] = len(nodes) > 0
    result["node_count"] = len(nodes)

    connections = wf.get("connections", {})
    if not isinstance(connections, dict):
        connections = {}
    result["has_connections"] = len(connections) > 0
    result["connection_count"] = sum(
        len(targets)
        for targets in connections.values()
        if hasattr(targets, "__len__")
    )

    accepted_prefixes = ("n8n-nodes-base.", "@n8n/")
    invalid_nodes = []
    for node in nodes:
        if not isinstance(node, dict):
            invalid_nodes.append(str(node))
            continue

        node_type = node.get("type", "")
        if not node_type:
            # A missing or empty type is tolerated and not counted as invalid.
            continue
        if not isinstance(node_type, str):
            invalid_nodes.append(str(node_type))
            continue

        # Any type in a known n8n namespace is accepted, not only the
        # explicitly listed ones.
        if node_type.startswith(accepted_prefixes) or node_type in VALID_NODE_TYPES:
            continue
        invalid_nodes.append(node_type)

    result["invalid_nodes"] = invalid_nodes
    result["nodes_valid"] = len(invalid_nodes) == 0

    return result
|
|
|
|
|
|
|
|
def validate_thinking(thinking: str) -> Dict[str, bool]:
    """Apply simple heuristics to the model's reasoning text.

    Args:
        thinking: Reasoning text extracted from the response (may be empty).

    Returns:
        A dict with two flags:

        * ``has_thinking``: the text is longer than 50 characters;
        * ``thinking_structured``: the text contains a digit run followed by
          a dot, a line starting with ``-`` or ``*`` followed by whitespace,
          or the word "étape" / "step" (case-insensitive).
    """
    structured = False

    if thinking:
        lowered = thinking.lower()
        numbered = re.search(r'\d+\.', thinking)
        # One pattern covers both "- " and "* " bullets at the start of a line.
        bulleted = re.search(r'^[-*]\s', thinking, re.MULTILINE)
        structured = bool(
            numbered
            or bulleted
            or "étape" in lowered
            or "step" in lowered
        )

    return {
        "has_thinking": len(thinking) > 50,
        "thinking_structured": structured,
    }
|
|
|
|
|
|
|
|
def evaluate_example(
    model_output: str,
    task_type: str,
) -> EvalResult:
    """Score a single response generated by the model.

    Args:
        model_output: Raw text produced by the model.
        task_type: Task label to attach to the result.

    Returns:
        An ``EvalResult`` combining the workflow checks, the reasoning checks
        and the workflow parse error (empty string when there is none).
    """
    reasoning, workflow_text = extract_workflow_json(model_output)

    workflow_checks = validate_workflow(workflow_text)
    reasoning_checks = validate_thinking(reasoning)

    # Collect the boolean flags from both validators, then build the result.
    flags = {
        name: workflow_checks[name]
        for name in ("valid_json", "has_nodes", "has_connections", "nodes_valid")
    }
    for name in ("has_thinking", "thinking_structured"):
        flags[name] = reasoning_checks[name]

    return EvalResult(
        task_type=task_type,
        error=workflow_checks.get("error", ""),
        **flags,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# (task label, keywords) pairs checked in order against the lowercased first
# message; the first pair with a matching keyword wins.
_TASK_KEYWORDS = (
    ("generate", ("génère", "generate")),
    ("edit", ("édite", "edit")),
    ("fix", ("corrige", "fix")),
    ("improve", ("améliore", "improve")),
    ("explain", ("explique", "explain")),
    ("debug", ("débogue", "debug")),
)


def _detect_task_type(system_msg: str) -> str:
    """Return the task label whose keyword occurs in *system_msg*, else "unknown"."""
    lowered = system_msg.lower()
    for task_type, keywords in _TASK_KEYWORDS:
        if any(keyword in lowered for keyword in keywords):
            return task_type
    return "unknown"


def run_evaluation(
    model_path: str,
    dataset_repo: str = "stmasson/n8n-agentic-multitask",
    data_file: str = "data/multitask_large/val.jsonl",
    num_samples: int = 100,
    output_file: str = "eval_results.json",
) -> Dict[str, float]:
    """Run the full evaluation of a model and write a JSON report.

    Steps: optional Hugging Face login (when ``HF_TOKEN`` is set), model and
    tokenizer loading, dataset loading and sub-sampling, greedy generation for
    every example, scoring with ``evaluate_example``, printing of global and
    per-task metrics, and serialisation of the report to ``output_file``.

    Args:
        model_path: Model identifier or path passed to ``from_pretrained``.
        dataset_repo: Dataset repository passed to ``load_dataset``.
        data_file: File inside the repository used as the validation split.
        num_samples: Maximum number of examples to evaluate; when smaller
            than the dataset, a seeded shuffle (seed 42) selects the subset.
        output_file: Path of the JSON report to write.

    Returns:
        A dict mapping each global metric name to its mean value. All values
        are ``0.0`` when no example was evaluated.
    """
    print("=" * 60)
    print("ÉVALUATION DU MODÈLE N8N EXPERT")
    print("=" * 60)

    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    print(f"\nChargement du modèle: {model_path}")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )

    print(f"\nChargement du dataset: {dataset_repo}")
    dataset = load_dataset(
        dataset_repo,
        data_files={"validation": data_file},
        split="validation"
    )

    if num_samples < len(dataset):
        dataset = dataset.shuffle(seed=42).select(range(num_samples))

    print(f"Évaluation sur {len(dataset)} exemples")

    results = []
    task_counts = {}

    for example in tqdm(dataset, desc="Évaluation"):
        messages = example["messages"]

        # The task label is inferred from keywords in the first message.
        system_msg = messages[0]["content"] if messages else ""
        task_type = _detect_task_type(system_msg)
        task_counts[task_type] = task_counts.get(task_type, 0) + 1

        # The last message is left out of the prompt so the model has to
        # produce it (presumably the reference answer; verify against the
        # dataset format).
        prompt = tokenizer.apply_chat_template(
            messages[:-1],
            tokenize=False,
            add_generation_prompt=True,
        )

        try:
            output = pipe(
                prompt,
                max_new_tokens=4096,
                do_sample=False,
                temperature=None,
                top_p=None,
                return_full_text=False,
            )
            generated = output[0]["generated_text"]
        except Exception as e:
            # Best effort: a failed generation is scored as a failed example
            # instead of aborting the whole run.
            generated = f"ERROR: {str(e)}"

        eval_result = evaluate_example(generated, task_type)
        results.append(eval_result)

    print("\n" + "=" * 60)
    print("RÉSULTATS")
    print("=" * 60)

    total = len(results)
    if total == 0:
        print("\nAucun exemple évalué: toutes les métriques valent 0.")
    # Guard against ZeroDivisionError when nothing was evaluated
    # (e.g. num_samples=0 or an empty split).
    denominator = max(1, total)

    metrics = {
        "valid_json": sum(r.valid_json for r in results) / denominator,
        "has_nodes": sum(r.has_nodes for r in results) / denominator,
        "has_connections": sum(r.has_connections for r in results) / denominator,
        "nodes_valid": sum(r.nodes_valid for r in results) / denominator,
        "has_thinking": sum(r.has_thinking for r in results) / denominator,
        "thinking_structured": sum(r.thinking_structured for r in results) / denominator,
        "overall_score": sum(r.score for r in results) / denominator,
    }

    print("\nMétriques globales:")
    for metric, value in metrics.items():
        print(f" {metric}: {value:.1%}")

    # Group results by task once; reused for the printout and the report.
    results_by_task = {task: [] for task in task_counts}
    for r in results:
        results_by_task[r.task_type].append(r)

    print("\nMétriques par tâche:")
    for task_type in sorted(results_by_task):
        task_results = results_by_task[task_type]
        if task_results:
            task_score = sum(r.score for r in task_results) / len(task_results)
            task_json = sum(r.valid_json for r in task_results) / len(task_results)
            print(f" {task_type}: score={task_score:.1%}, json={task_json:.1%} (n={len(task_results)})")

    output = {
        "model": model_path,
        "num_samples": total,
        "metrics": metrics,
        "by_task": {
            task: {
                "count": len(task_results),
                "score": sum(r.score for r in task_results) / max(1, len(task_results)),
            }
            for task, task_results in results_by_task.items()
        },
    }

    # Explicit encoding so the report does not depend on the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"\nRésultats sauvegardés dans: {output_file}")

    return metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Parse the command-line options and launch the evaluation."""
    cli = argparse.ArgumentParser(description="Évaluation du modèle n8n Expert")
    cli.add_argument("--model", type=str, required=True, help="Chemin du modèle à évaluer")
    cli.add_argument("--samples", type=int, default=100, help="Nombre d'exemples à évaluer")
    cli.add_argument("--output", type=str, default="eval_results.json", help="Fichier de sortie")
    options = cli.parse_args()

    run_evaluation(
        model_path=options.model,
        num_samples=options.samples,
        output_file=options.output,
    )


if __name__ == "__main__":
    main()
|
|
|