iwonachristop committed
Commit 4af6f49 · verified · 1 Parent(s): 04191fb

Update pages/evaluate.md

Files changed (1)
  1. pages/evaluate.md +108 -4
pages/evaluate.md CHANGED
@@ -1,10 +1,114 @@
# 🔢 Evaluate your model

- To evaluate your model, you can use the code below. Remember to change the `results_path` to the
- path to the directory where your outputs are saved. The outputs for each dataset should be saved in
- a two-column `TSV` file, where the first column is the `file_id` and the second is `predicted`. Remember
- to name each file after the corresponding dataset.

```python
  ```
 
# 🔢 Evaluate your model

+ To evaluate your model according to the methodology used in our paper, you can use the following code.

```python
+ import os
+ import string

+ from Levenshtein import ratio
+ from datasets import load_dataset, Dataset, concatenate_datasets
+ from sklearn.metrics import classification_report, f1_score, accuracy_score
+
+ # 🔧 Change this path to where your JSONL prediction files are stored
+ outputs_path = "./"
+
+ _DATASETS = [
+     "cafe", "crema_d", "emns", "emozionalmente", "enterface",
+     "jl_Corpus", "mesd", "nemo", "oreau", "pavoque",
+     "ravdess", "resd", "subesco",
+ ]
+
+ THRESHOLD = 0.57
+
+
+ def get_expected(split: str) -> tuple[set, str, dict]:
+     """Load expected emotion labels and language metadata from CAMEO dataset."""
+     ds = load_dataset("amu-cai/CAMEO", split=split)
+     return set(ds["emotion"]), ds["language"][0], dict(zip(ds["file_id"], ds["emotion"]))
+
+
+ def process_outputs(dataset_name: str) -> tuple[Dataset, set, str]:
+     """Clean and correct predictions, returning a Dataset with fixed predictions."""
+     outputs = Dataset.from_json(os.path.join(outputs_path, f"{dataset_name}.jsonl"))
+     options, language, expected = get_expected(dataset_name)
+
+     def preprocess(x):
+         return {
+             "predicted": x["predicted"].translate(str.maketrans('', '', string.punctuation)).lower().strip(),
+             "expected": expected.get(x["file_id"]),
+         }
+
+     outputs = outputs.map(preprocess)
+
+     def fix_prediction(x):
+         if x["predicted"] in options:
+             x["fixed_prediction"] = x["predicted"]
+         else:
+             predicted_words = x["predicted"].split()
+             label_scores = {
+                 label: sum(r for r in (ratio(label, word) for word in predicted_words) if r > THRESHOLD)
+                 for label in options
+             }
+             x["fixed_prediction"] = max(label_scores, key=label_scores.get)
+         return x
+
+     outputs = outputs.map(fix_prediction)
+     return outputs, options, language
+
+
+ def calculate_metrics(outputs: Dataset, labels: set) -> dict:
+     """Compute classification metrics."""
+     y_true = outputs["expected"]
+     y_pred = outputs["fixed_prediction"]
+
+     return {
+         "f1_macro": f1_score(y_true, y_pred, average="macro"),
+         "weighted_f1": f1_score(y_true, y_pred, average="weighted"),
+         "accuracy": accuracy_score(y_true, y_pred),
+         "metrics_per_label": classification_report(
+             y_true, y_pred, target_names=sorted(labels), output_dict=True
+         ),
+     }
+
+
+ # 🧮 Main Evaluation Loop
+ results = []
+ outputs_per_language = {}
+ full_outputs, full_labels = None, set()
+
+ for dataset in _DATASETS:
+     jsonl_path = os.path.join(outputs_path, f"{dataset}.jsonl")
+
+     if not os.path.isfile(jsonl_path):
+         print(f"Jsonl file for {dataset} not found.")
+         continue
+
+     outputs, labels, language = process_outputs(dataset)
+     metrics = calculate_metrics(outputs, labels)
+     results.append({"language": language, "dataset": dataset, **metrics})
+
+     if language not in outputs_per_language:
+         outputs_per_language[language] = {"labels": labels, "outputs": outputs}
+     else:
+         outputs_per_language[language]["labels"] |= labels
+         outputs_per_language[language]["outputs"] = concatenate_datasets([
+             outputs_per_language[language]["outputs"], outputs
+         ])
+
+     full_outputs = outputs if full_outputs is None else concatenate_datasets([full_outputs, outputs])
+     full_labels |= labels
+
+ # 🔤 Per-language evaluation
+ for language, data in outputs_per_language.items():
+     metrics = calculate_metrics(data["outputs"], data["labels"])
+     results.append({"language": language, "dataset": "all", **metrics})
+
+ # 🌍 Global evaluation
+ if full_outputs is not None:
+     metrics = calculate_metrics(full_outputs, full_labels)
+     results.append({"language": "all", "dataset": "all", **metrics})
+
+ # 💾 Save results
+ Dataset.from_list(results).to_json(os.path.join(outputs_path, "results.jsonl"))
```
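
The script above loads one prediction file per dataset, named `<dataset>.jsonl` and read with `Dataset.from_json`, so each line must be a JSON object containing at least a `file_id` and a `predicted` field. A minimal sketch of producing such a file is shown below; the file IDs and emotion strings are illustrative placeholders, not values taken from the CAMEO splits.

```python
# Minimal sketch: write a prediction file the evaluation script can read.
# The file IDs and predicted strings below are hypothetical placeholders.
from datasets import Dataset

predictions = [
    {"file_id": "example_0001", "predicted": "happiness"},
    # Free-form text is also accepted; the script fuzzy-matches it to a label.
    {"file_id": "example_0002", "predicted": "the speaker sounds angry"},
]

# to_json writes JSON Lines by default, matching Dataset.from_json above.
Dataset.from_list(predictions).to_json("ravdess.jsonl")
```

The per-dataset, per-language, and global metrics are written to `results.jsonl` in the same directory and can be inspected the same way, e.g. with `Dataset.from_json("results.jsonl")`.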