Evaluate your model
To evaluate your model according to the methodology used in our paper, you can use the following code.
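The script below expects one JSONL prediction file per corpus in outputs_path, named after the dataset (for example crema_d.jsonl), where every line carries the file_id of a CAMEO sample and your model's raw predicted label under the key predicted. As a minimal sketch of how such a file might be produced, assuming a placeholder run_inference function and that the split's audio column is what your model consumes:

import json
from datasets import load_dataset

def run_inference(audio) -> str:
    # Placeholder: replace with a call to your own model; it should return a raw label string.
    raise NotImplementedError

dataset_name = "crema_d"
ds = load_dataset("amu-cai/CAMEO", split=dataset_name)

with open(f"{dataset_name}.jsonl", "w", encoding="utf-8") as f:
    for sample in ds:
        f.write(json.dumps({
            "file_id": sample["file_id"],
            "predicted": run_inference(sample["audio"]),  # "audio" column name is an assumption
        }) + "\n")

Only the file_id and predicted fields are used for scoring; any extra fields in the JSONL lines are ignored.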
import os
import string
from Levenshtein import ratio
from datasets import load_dataset, Dataset, concatenate_datasets
from sklearn.metrics import classification_report, f1_score, accuracy_score
# Change this path to where your JSONL prediction files are stored
outputs_path = "./"
_DATASETS = [
    "cafe", "crema_d", "emns", "emozionalmente", "enterface",
    "jl_Corpus", "mesd", "nemo", "oreau", "pavoque",
    "ravdess", "resd", "subesco",
]
THRESHOLD = 0.57  # minimum Levenshtein ratio for a word-label pair to contribute to the matching score
def get_expected(split: str) -> tuple[set, str, dict]:
    """Load expected emotion labels and language metadata from the CAMEO dataset."""
    ds = load_dataset("amu-cai/CAMEO", split=split)
    return set(ds["emotion"]), ds["language"][0], dict(zip(ds["file_id"], ds["emotion"]))
def process_outputs(dataset_name: str) -> tuple[Dataset, set, str]:
    """Clean and correct predictions, returning a Dataset with fixed predictions."""
    outputs = Dataset.from_json(os.path.join(outputs_path, f"{dataset_name}.jsonl"))
    options, language, expected = get_expected(dataset_name)

    def preprocess(x):
        return {
            "predicted": x["predicted"].translate(str.maketrans('', '', string.punctuation)).lower().strip(),
            "expected": expected.get(x["file_id"]),
        }

    outputs = outputs.map(preprocess)

    def fix_prediction(x):
        if x["predicted"] in options:
            x["fixed_prediction"] = x["predicted"]
        else:
            # Map free-form predictions to the closest valid label: sum the Levenshtein
            # ratios above THRESHOLD between each label and each predicted word,
            # then keep the label with the highest cumulative score.
            predicted_words = x["predicted"].split()
            label_scores = {
                label: sum(r for r in (ratio(label, word) for word in predicted_words) if r > THRESHOLD)
                for label in options
            }
            x["fixed_prediction"] = max(label_scores, key=label_scores.get)
        return x

    outputs = outputs.map(fix_prediction)
    return outputs, options, language
def calculate_metrics(outputs: Dataset, labels: set) -> dict:
    """Compute classification metrics."""
    y_true = outputs["expected"]
    y_pred = outputs["fixed_prediction"]
    return {
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "weighted_f1": f1_score(y_true, y_pred, average="weighted"),
        "accuracy": accuracy_score(y_true, y_pred),
        "metrics_per_label": classification_report(
            y_true, y_pred, target_names=sorted(labels), output_dict=True
        ),
    }
# Main evaluation loop: per-dataset metrics, grouped by language and overall
results = []
outputs_per_language = {}
full_outputs, full_labels = None, set()

for dataset in _DATASETS:
    jsonl_path = os.path.join(outputs_path, f"{dataset}.jsonl")
    if not os.path.isfile(jsonl_path):
        print(f"JSONL file for {dataset} not found.")
        continue

    outputs, labels, language = process_outputs(dataset)
    metrics = calculate_metrics(outputs, labels)
    results.append({"language": language, "dataset": dataset, **metrics})

    if language not in outputs_per_language:
        outputs_per_language[language] = {"labels": labels, "outputs": outputs}
    else:
        outputs_per_language[language]["labels"] |= labels
        outputs_per_language[language]["outputs"] = concatenate_datasets([
            outputs_per_language[language]["outputs"], outputs
        ])

    full_outputs = outputs if full_outputs is None else concatenate_datasets([full_outputs, outputs])
    full_labels |= labels
# Per-language evaluation
for language, data in outputs_per_language.items():
    metrics = calculate_metrics(data["outputs"], data["labels"])
    results.append({"language": language, "dataset": "all", **metrics})
# Global evaluation
if full_outputs is not None:
    metrics = calculate_metrics(full_outputs, full_labels)
    results.append({"language": "all", "dataset": "all", **metrics})
# Save results
Dataset.from_list(results).to_json(os.path.join(outputs_path, "results.jsonl"))
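After the run, results.jsonl contains one row per evaluated dataset, one aggregated row per language (with dataset set to "all"), and one global row (language and dataset both "all"). A minimal sketch for inspecting the saved scores, assuming the file was written to the default outputs_path:

import json

# results.jsonl is JSON Lines: one metrics record per line
with open("results.jsonl", encoding="utf-8") as f:
    for line in f:
        row = json.loads(line)
        print(f'{row["language"]:>10} | {row["dataset"]:>15} | '
              f'F1 macro: {row["f1_macro"]:.3f} | accuracy: {row["accuracy"]:.3f}')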