arunasrivastava committed
Commit 3a023fb · 1 Parent(s): 03346c0

without eval library
__pycache__/main.cpython-310.pyc CHANGED
Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ
 
__pycache__/phone_metrics.cpython-310.pyc ADDED
Binary file (4.16 kB).
 
app.py CHANGED
@@ -147,7 +147,8 @@ with gr.Blocks(css="""
  border-bottom: 1px solid #dddddd;
  }
  """) as demo:
- gr.Markdown("# 🎯 Phonemic Transcription Model Evaluation Leaderboard")
+ gr.Markdown("# 🎯 Phonemic Transcription Leaderboard")
+ gr.Markdown("#### Developed By: Koel Labs")
  gr.Markdown("""
  ## Explanation of Metrics
  - **PER (Phoneme Error Rate)**: The Levenshtein distance calculated between phoneme sequences of the predicted and actual transcriptions.
@@ -158,7 +159,7 @@ with gr.Blocks(css="""
  The test set used for evaluation is from the [TIMIT speech corpus](https://www.kaggle.com/datasets/mfekadu/darpa-timit-acousticphonetic-continuous-speech). The TIMIT corpus is a widely used dataset for speech recognition research.

  ## Compute
- This leaderboard uses the free basic plan (16GB RAM, 2vCPUs). The evaluation may take several hours to complete. Please be patient and do not submit the same model multiple times.
+ This leaderboard uses the free basic plan (16GB RAM, 2vCPUs) to allow for reproducibility. The evaluation may take several hours to complete. Please be patient and do not submit the same model multiple times.
  """)
  with gr.Tabs():
  with gr.TabItem("🏆 Leaderboard"):
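The PER bullet above is an edit (Levenshtein) distance over phoneme sequences; as implemented in phone_metrics.py further down, it is normalised by the number of phones in the reference. A toy illustration of that definition, in plain Python with made-up phoneme sequences (the leaderboard itself segments IPA and computes this with panphon):

```python
# Toy PER illustration: edit distance over phoneme sequences / reference length.
def levenshtein(a, b):
    # Classic dynamic-programming edit distance between two sequences.
    d = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        d[i][0] = i
    for j in range(len(b) + 1):
        d[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            d[i][j] = min(
                d[i - 1][j] + 1,                           # deletion
                d[i][j - 1] + 1,                           # insertion
                d[i - 1][j - 1] + (a[i - 1] != b[j - 1]),  # substitution
            )
    return d[-1][-1]

reference = ["ð", "æ", "t"]            # made-up phoneme sequences
prediction = ["d", "æ", "t", "ə"]
print(levenshtein(prediction, reference) / len(reference))  # 2 / 3 ≈ 0.67
```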
constants.py DELETED
@@ -1,108 +0,0 @@
- from pathlib import Path
-
- # Directory where request by models are stored
- DIR_OUTPUT_REQUESTS = Path("requested_models")
- EVAL_REQUESTS_PATH = Path("eval_requests")
-
- ##########################
- # Text definitions #
- ##########################
-
- banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/asr_leaderboard.png"
- BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'
-
- TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 IPA Transcription Leaderboard </b> </body> </html>"
-
- INTRODUCTION_TEXT = "📐 The 🤗 IPA transcription Leaderboard ranks and evaluates speech recognition models \
- on the Hugging Face Hub. \
- \nWe report the Average [PER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
- \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \
- \nThe leaderboard currently focuses on English speech recognition, and will be expanded to multilingual evaluation in later versions."
-
-
- METRICS_TAB_TEXT = """
- Here you will find details about the speech recognition metrics and datasets reported in our leaderboard.
-
- ## Metrics
-
- Models are evaluated jointly using the Word Error Rate (WER) and Inverse Real Time Factor (RTFx) metrics. The WER metric
- is used to assess the accuracy of a system, and the RTFx the inference speed. Models are ranked in the leaderboard based
- on their WER, lowest to highest.
-
- Crucially, the WER and RTFx values are computed for the same inference run using a single script. The implication of this is two-fold:
- 1. The WER and RTFx values are coupled: for a given WER, one can expect to achieve the corresponding RTFx. This allows the proposer to trade-off lower WER for higher RTFx should they wish.
- 2. The WER and RTFx values are averaged over all audios in the benchmark (in the order of thousands of audios).
-
- For details on reproducing the benchmark numbers, refer to the [Open ASR GitHub repository](https://github.com/huggingface/open_asr_leaderboard#evaluate-a-model).
-
- ### Word Error Rate (WER)
-
- Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It calculates the percentage
- of words in the system's output that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
-
- Take the following example:
-
- | Reference: | the | cat | sat | on | the | mat |
- |-------------|-----|-----|---------|-----|-----|-----|
- | Prediction: | the | cat | **sit** | on | the | | |
- | Label: | ✅ | ✅ | S | ✅ | ✅ | D |
-
- Here, we have:
- * 1 substitution ("sit" instead of "sat")
- * 0 insertions
- * 1 deletion ("mat" is missing)
-
- This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our
- reference (N), which for this example is 6:
-
- ```
- WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
- ```
-
- Giving a WER of 0.33, or 33%. For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints, meaning punctuation and casing is removed from the references and predictions. You can find the evaluation code on our [Github repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
-
- ### Inverse Real Time Factor (RTFx)
-
- Inverse Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes an
- model to process a given amount of speech. It is defined as:
- ```
- RTFx = (number of seconds of audio inferred) / (compute time in seconds)
- ```
-
- Therefore, and RTFx of 1 means a system processes speech as fast as it's spoken, while an RTFx of 2 means it takes half the time.
- Thus, **a higher RTFx value indicates lower latency**.
-
- ## How to reproduce our results
-
- The ASR Leaderboard will be a continued effort to benchmark open source/access speech recognition models where possible.
- Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.
- For more details head over to our repo at: https://github.com/huggingface/open_asr_leaderboard
-
- P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️
-
- ## Benchmark datasets
-
- Evaluating Speech Recognition systems is a hard problem. We use the multi-dataset benchmarking strategy proposed in the
- [ESB paper](https://arxiv.org/abs/2210.13352) to obtain robust evaluation scores for each model.
-
- ESB is a benchmark for evaluating the performance of a single automatic speech recognition (ASR) system across a broad
- set of speech datasets. It comprises eight English speech recognition datasets, capturing a broad range of domains,
- acoustic conditions, speaker styles, and transcription requirements. As such, it gives a better indication of how
- a model is likely to perform on downstream ASR compared to evaluating it on one dataset alone.
-
- The ESB score is calculated as a macro-average of the WER scores across the ESB datasets. The models in the leaderboard
- are ranked based on their average WER scores, from lowest to highest.
-
- We are currently working to add and curate more datasets. Right now, models will be evaluated just on the TIMIT test dataset for phoneme transcription.
-
- | Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
- |-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
- | [TIMIT Dataset](https://www.kaggle.com/datasets/mfekadu/darpa-timit-acousticphonetic-continuous-speech) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
- For more details on the individual datasets and how models are evaluated to give the ESB score, refer to the [ESB paper](https://arxiv.org/abs/2210.13352).
- """
-
- LEADERBOARD_CSS = """
- #leaderboard-table th .header-content {
- white-space: nowrap;
- }
- """
img/logo-white.png ADDED
install.sh ADDED
@@ -0,0 +1,11 @@
+ # Create a virtual environment with Python 3.10
+ python3.10 -m venv venv
+
+ # Activate the virtual environment
+ source venv/bin/activate
+
+ # Install the required dependencies
+ pip install -r requirements.txt
+
+ # Run the application
+ uvicorn app:app --host 0.0.0.0 --port 7860
main.py CHANGED
@@ -14,6 +14,7 @@ import os
  from pathlib import Path
  from huggingface_hub import HfApi
  import evaluate
+ from phone_metrics import PhoneErrorMetrics

  # Set up download configuration with your token

@@ -42,7 +43,7 @@ PATHS = {
  }

  # Initialize evaluation metric
- phone_errors = evaluate.load("ginic/phone_errors")
+ phone_errors = PhoneErrorMetrics()


  class TimitDataManager:
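This swap is the substance of the commit message: instead of loading the ginic/phone_errors metric through the evaluate library at startup, main.py now instantiates the local PhoneErrorMetrics port added in phone_metrics.py below, which exposes the same compute(predictions=..., references=...) call that the evaluate metric provided.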
phone_metrics.py ADDED
@@ -0,0 +1,106 @@
+ # phone_metrics.py
+
+ """
+ This module implements phone error metrics based on the work from ginic/phone_errors.
+ Original implementation: https://huggingface.co/spaces/ginic/phone_errors
+
+ Citation:
+ @inproceedings{Mortensen-et-al:2016,
+     author = {David R. Mortensen and
+               Patrick Littell and
+               Akash Bharadwaj and
+               Kartik Goyal and
+               Chris Dyer and
+               Lori S. Levin},
+     title = {PanPhon: {A} Resource for Mapping {IPA} Segments to Articulatory Feature Vectors},
+     booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
+     pages = {3475--3484},
+     publisher = {{ACL}},
+     year = {2016}
+ }
+ """
+
+ import numpy as np
+ import panphon.distance
+ from typing import List, Dict
+
+ class PhoneErrorMetrics:
+     def __init__(self, feature_model: str = "segment"):
+         """Initialize the phone error metrics calculator.
+
+         Args:
+             feature_model (str): panphon feature parsing model ("strict", "permissive", or "segment")
+         """
+         self.distance_computer = panphon.distance.Distance(feature_model=feature_model)
+
+     def _phone_error_rate(self, prediction: str, reference: str) -> float:
+         """Compute phone error rate between prediction and reference.
+
+         Args:
+             prediction (str): Predicted IPA string
+             reference (str): Reference IPA string
+
+         Returns:
+             float: Phone error rate
+         """
+         if not reference:
+             raise ValueError("Reference string cannot be empty")
+
+         pred_phones = self.distance_computer.fm.ipa_segs(prediction)
+         ref_phones = self.distance_computer.fm.ipa_segs(reference)
+
+         phone_edits = self.distance_computer.min_edit_distance(
+             lambda x: 1,  # deletion cost
+             lambda x: 1,  # insertion cost
+             lambda x, y: 0 if x == y else 1,  # substitution cost
+             [[]],
+             pred_phones,
+             ref_phones
+         )
+
+         return phone_edits / len(ref_phones)
+
+     def compute(self,
+                 predictions: List[str],
+                 references: List[str],
+                 is_normalize_pfer: bool = False) -> Dict:
+         """Compute phone error metrics between predictions and references.
+
+         Args:
+             predictions (List[str]): List of predicted IPA strings
+             references (List[str]): List of reference IPA strings
+             is_normalize_pfer (bool): Whether to normalize phone feature error rates
+
+         Returns:
+             Dict containing:
+                 - phone_error_rates: List of PER for each pair
+                 - mean_phone_error_rate: Average PER
+                 - phone_feature_error_rates: List of PFER for each pair
+                 - mean_phone_feature_error_rate: Average PFER
+                 - feature_error_rates: List of FER for each pair
+                 - mean_feature_error_rate: Average FER
+         """
+         phone_error_rates = []
+         feature_error_rates = []
+         hamming_distances = []
+
+         for pred, ref in zip(predictions, references):
+             if is_normalize_pfer:
+                 hd = self.distance_computer.hamming_feature_edit_distance_div_maxlen(pred, ref)
+             else:
+                 hd = self.distance_computer.hamming_feature_edit_distance(pred, ref)
+
+             hamming_distances.append(hd)
+             per = self._phone_error_rate(pred, ref)
+             phone_error_rates.append(per)
+             fer = self.distance_computer.feature_error_rate(pred, ref)
+             feature_error_rates.append(fer)
+
+         return {
+             "phone_error_rates": phone_error_rates,
+             "mean_phone_error_rate": float(np.mean(phone_error_rates)),
+             "phone_feature_error_rates": hamming_distances,
+             "mean_phone_feature_error_rate": float(np.mean(hamming_distances)),
+             "feature_error_rates": feature_error_rates,
+             "mean_feature_error_rate": float(np.mean(feature_error_rates))
+         }
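For reference, a small usage sketch (not part of the commit) showing how this class scores a single prediction/reference pair. The IPA strings are copied from the first detailed_results entry for the ginic model in queue/results.json below; panphon and numpy are assumed to be installed (requirements.txt is not shown in this diff):

```python
# Usage sketch (not part of the commit): score one TIMIT reference/prediction pair
# with the PhoneErrorMetrics class above. Assumes panphon and numpy are installed.
from phone_metrics import PhoneErrorMetrics

metric = PhoneErrorMetrics()
result = metric.compute(
    predictions=["ʃihædjɹ̩dɑɹksudɪnɡɹisiwɑʃwɑɾɹ̩ɔljiɹ"],  # model output for SA1.WAV
    references=["ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ"],   # TIMIT ground truth for SA1.WAV
)
print(result["mean_phone_error_rate"])          # PER for this single pair
print(result["mean_phone_feature_error_rate"])  # feature-weighted distance, presumably the PWED column
```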
queue/leaderboard.json CHANGED
@@ -98,5 +98,85 @@
98
  "subset": "test",
99
  "github_url": "https://huggingface.co/ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
100
  "submission_date": "2024-12-18T23:29:27.322286"
101
  }
102
  ]
 
98
  "subset": "test",
99
  "github_url": "https://huggingface.co/ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
100
  "submission_date": "2024-12-18T23:29:27.322286"
101
+ },
102
+ {
103
+ "submission_id": "d0b2f8b4-20f8-45b4-b1a5-c81390d75b29",
104
+ "submission_name": "wav2vec2 non-english transcription",
105
+ "model": "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
106
+ "average_per": 0.6417205190285036,
107
+ "average_pwed": 0.19048963968896404,
108
+ "subset": "test",
109
+ "github_url": "https://huggingface.co/ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
110
+ "submission_date": "2024-12-19T07:41:18.135985"
111
+ },
112
+ {
113
+ "submission_id": "3bbb0f03-31a5-45b0-bde3-bbf574f19983",
114
+ "submission_name": "phonetic transcription with the Buckeye corpus, from xlsr-53 model",
115
+ "model": "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-buckeye-ipa",
116
+ "average_per": 0.2810165988557621,
117
+ "average_pwed": 0.10703377161801164,
118
+ "subset": "test",
119
+ "github_url": "https://github.com/ginic/multipa/tree/buckeye_experiments",
120
+ "submission_date": "2024-12-20T13:45:52.010575"
121
+ },
122
+ {
123
+ "submission_id": "2ed095f7-4712-4539-87b6-1e8588ac92a3",
124
+ "submission_name": "phonetic transcription",
125
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-INTERNATIONAL1.9.2WithoutSpaces",
126
+ "average_per": 0.9537775908999574,
127
+ "average_pwed": 0.9351204819224959,
128
+ "subset": "test",
129
+ "github_url": "https://huggingface.co/Jubliano/wav2vec2-large-xls-r-300m-ipa-INTERNATIONAL1.5WithoutSpaces",
130
+ "submission_date": "2024-12-20T14:21:32.293694"
131
+ },
132
+ {
133
+ "submission_id": "9cf02ce8-fc43-4d23-a8bb-b44e3116a93c",
134
+ "submission_name": "Jubliano xlsr model",
135
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-nl",
136
+ "average_per": 0.9887075544197294,
137
+ "average_pwed": 0.9692486915717254,
138
+ "subset": "test",
139
+ "github_url": "https://huggingface.co/Jubliano/wav2vec2-large-xls-r-300m-ipa-nl1.1",
140
+ "submission_date": "2024-12-20T15:40:51.632895"
141
+ },
142
+ {
143
+ "submission_id": "d5013845-f5c9-428a-8b39-7db066bb9f05",
144
+ "submission_name": "speech31 phoneme transcription english",
145
+ "model": "speech31/wavlm-large-english-ipa",
146
+ "average_per": 0.3694017596969614,
147
+ "average_pwed": 0.1356824900612308,
148
+ "subset": "test",
149
+ "github_url": "https://huggingface.co/speech31/wavlm-large-english-ipa",
150
+ "submission_date": "2024-12-20T16:26:47.982209"
151
+ },
152
+ {
153
+ "submission_id": "362c788d-bc2e-427d-8c74-105f6235cf62",
154
+ "submission_name": "speech31 xlsr model",
155
+ "model": "speech31/XLS-R-300m-english-ipa",
156
+ "average_per": 0.36382554692045954,
157
+ "average_pwed": 0.1299702312124616,
158
+ "subset": "test",
159
+ "github_url": "https://huggingface.co/speech31/XLS-R-300m-english-ipa",
160
+ "submission_date": "2024-12-20T16:47:54.826509"
161
+ },
162
+ {
163
+ "submission_id": "49e22782-0af1-4313-bc0c-60cb2f28d78f",
164
+ "submission_name": "model is a fine-tuned version of facebook/wav2vec2-large on the TIMIT dataset",
165
+ "model": "speech31/wav2vec2-large-english-TIMIT-phoneme_v3",
166
+ "average_per": 0.44563344149564776,
167
+ "average_pwed": 0.18844914029048124,
168
+ "subset": "test",
169
+ "github_url": "https://huggingface.co/speech31/wav2vec2-large-english-TIMIT-phoneme_v3",
170
+ "submission_date": "2024-12-20T17:05:35.213738"
171
+ },
172
+ {
173
+ "submission_id": "26c04108-1131-435c-95f1-bb56b2aff06c",
174
+ "submission_name": "fine-tuned version of facebook/wav2vec2-large on the None dataset",
175
+ "model": "speech31/wav2vec2-large-TIMIT-IPA2",
176
+ "average_per": 0.4847029843149011,
177
+ "average_pwed": 0.2072006544586948,
178
+ "subset": "test",
179
+ "github_url": "https://huggingface.co/speech31/wav2vec2-large-TIMIT-IPA2",
180
+ "submission_date": "2024-12-20T22:50:50.645178"
181
  }
182
  ]
queue/results.json CHANGED
@@ -504,5 +504,465 @@
504
  }
505
  ],
506
  "timestamp": "2024-12-18T23:29:27.320433"
507
  }
508
  ]
 
504
  }
505
  ],
506
  "timestamp": "2024-12-18T23:29:27.320433"
507
+ },
508
+ {
509
+ "task_id": "59afc37a-0072-44dd-a02a-0cf47d89c120",
510
+ "model": "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
511
+ "subset": "test",
512
+ "num_files": 1680,
513
+ "average_per": 0.6417205190285036,
514
+ "average_pwed": 0.19048963968896404,
515
+ "detailed_results": [
516
+ {
517
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
518
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
519
+ "prediction": "ʂiharjoɖarksɯudenɡwisiwaːʂwarɔːjiːr",
520
+ "per": 0.696969696969697,
521
+ "pwed": 0.20580808080808083
522
+ },
523
+ {
524
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
525
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
526
+ "prediction": "dɔndaːskmidɨkaːɻjɑno̞jwɯräːɡläikθaːn",
527
+ "per": 0.8214285714285714,
528
+ "pwed": 0.17338709677419356
529
+ },
530
+ {
531
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
532
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
533
+ "prediction": "çizkatːɛnwɔstinanhaːɡɛɾdanɨzbirufubuswɔwoːɾnenʂaːbi",
534
+ "per": 0.5531914893617021,
535
+ "pwed": 0.1276595744680851
536
+ },
537
+ {
538
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
539
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
540
+ "prediction": "ðɔriːzɔnsfɾdɔɕtaːivsimtfuøʃnɛu",
541
+ "per": 0.5862068965517241,
542
+ "pwed": 0.08764367816091957
543
+ },
544
+ {
545
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
546
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
547
+ "prediction": "pɾɔdakʂɔnmɛjfaɔfaɾbuwɔuwɛkspɛktajʂons",
548
+ "per": 0.7575757575757576,
549
+ "pwed": 0.18806306306306303
550
+ }
551
+ ],
552
+ "timestamp": "2024-12-19T07:41:18.132953"
553
+ },
554
+ {
555
+ "task_id": "5517f6b2-6a76-4a2d-a6ce-33446f390c3b",
556
+ "model": "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-buckeye-ipa",
557
+ "subset": "test",
558
+ "num_files": 1680,
559
+ "average_per": 0.2810165988557621,
560
+ "average_pwed": 0.10703377161801164,
561
+ "detailed_results": [
562
+ {
563
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
564
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
565
+ "prediction": "ʃihædjɹ̩dɑɹksudɪnɡɹisiwɑʃwɑɾɹ̩ɔljiɹ",
566
+ "per": 0.18181818181818182,
567
+ "pwed": 0.07196969696969698
568
+ },
569
+ {
570
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
571
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
572
+ "prediction": "doʊndæskmitɪkæɹiʌnɔɪliɹæɡlaɪkðæʔ",
573
+ "per": 0.2857142857142857,
574
+ "pwed": 0.14062500000000003
575
+ },
576
+ {
577
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
578
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
579
+ "prediction": "hɪzkæptʌnwʌzθɪnhæɡɹ̩dɛnɪzbjuɾʌfl̩butswɹ̩wɔʊɹnɪnʃæbi",
580
+ "per": 0.2978723404255319,
581
+ "pwed": 0.09114583333333333
582
+ },
583
+ {
584
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
585
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
586
+ "prediction": "ðʌɹizʌnzfɹ̩ðʌstaɪvsimtfulɪʃnaʊ",
587
+ "per": 0.2413793103448276,
588
+ "pwed": 0.014367816091954023
589
+ },
590
+ {
591
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
592
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
593
+ "prediction": "pɹʌdʌkʃʌnmeɪfɔlfɑɹbʌloʊɛkspɛkteɪʃʌnz",
594
+ "per": 0.30303030303030304,
595
+ "pwed": 0.10532407407407407
596
+ }
597
+ ],
598
+ "timestamp": "2024-12-20T13:45:52.009233"
599
+ },
600
+ {
601
+ "task_id": "c2139f96-e79e-4f25-a525-aa039f65555f",
602
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-INTERNATIONAL1.9.2WithoutSpaces",
603
+ "subset": "test",
604
+ "num_files": 1680,
605
+ "average_per": 0.9537775908999574,
606
+ "average_pwed": 0.9351204819224959,
607
+ "detailed_results": [
608
+ {
609
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
610
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
611
+ "prediction": "iɛ2",
612
+ "per": 0.9696969696969697,
613
+ "pwed": 0.9406565656565656
614
+ },
615
+ {
616
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
617
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
618
+ "prediction": "iɛ2",
619
+ "per": 0.9285714285714286,
620
+ "pwed": 0.9285714285714286
621
+ },
622
+ {
623
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
624
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
625
+ "prediction": "iɛ2",
626
+ "per": 0.9787234042553191,
627
+ "pwed": 0.9583333333333333
628
+ },
629
+ {
630
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
631
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
632
+ "prediction": "iɛ2",
633
+ "per": 0.9655172413793104,
634
+ "pwed": 0.932471264367816
635
+ },
636
+ {
637
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
638
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
639
+ "prediction": "iɛ2",
640
+ "per": 0.9696969696969697,
641
+ "pwed": 0.9406565656565656
642
+ }
643
+ ],
644
+ "timestamp": "2024-12-20T14:21:32.290889"
645
+ },
646
+ {
647
+ "task_id": "d146f1f1-6e6e-4b28-9420-c652ae9a1002",
648
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-nl",
649
+ "subset": "test",
650
+ "num_files": 1680,
651
+ "average_per": 0.9887075544197294,
652
+ "average_pwed": 0.9692486915717254,
653
+ "detailed_results": [
654
+ {
655
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
656
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
657
+ "prediction": "p",
658
+ "per": 1.0,
659
+ "pwed": 0.9747474747474747
660
+ },
661
+ {
662
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
663
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
664
+ "prediction": "p",
665
+ "per": 1.0,
666
+ "pwed": 0.96875
667
+ },
668
+ {
669
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
670
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
671
+ "prediction": "p",
672
+ "per": 0.9787234042553191,
673
+ "pwed": 0.9787234042553191
674
+ },
675
+ {
676
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
677
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
678
+ "prediction": "p",
679
+ "per": 1.0,
680
+ "pwed": 0.9683908045977011
681
+ },
682
+ {
683
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
684
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
685
+ "prediction": "p",
686
+ "per": 0.9696969696969697,
687
+ "pwed": 0.9696969696969697
688
+ }
689
+ ],
690
+ "timestamp": "2024-12-20T15:26:27.658798"
691
+ },
692
+ {
693
+ "task_id": "265c5859-e7ba-492d-a6c9-45733dc17c99",
694
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-nl",
695
+ "subset": "test",
696
+ "num_files": 1680,
697
+ "average_per": 0.9887075544197294,
698
+ "average_pwed": 0.9692486915717254,
699
+ "detailed_results": [
700
+ {
701
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
702
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
703
+ "prediction": "p",
704
+ "per": 1.0,
705
+ "pwed": 0.9747474747474747
706
+ },
707
+ {
708
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
709
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
710
+ "prediction": "p",
711
+ "per": 1.0,
712
+ "pwed": 0.96875
713
+ },
714
+ {
715
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
716
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
717
+ "prediction": "p",
718
+ "per": 0.9787234042553191,
719
+ "pwed": 0.9787234042553191
720
+ },
721
+ {
722
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
723
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
724
+ "prediction": "p",
725
+ "per": 1.0,
726
+ "pwed": 0.9683908045977011
727
+ },
728
+ {
729
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
730
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
731
+ "prediction": "p",
732
+ "per": 0.9696969696969697,
733
+ "pwed": 0.9696969696969697
734
+ }
735
+ ],
736
+ "timestamp": "2024-12-20T15:40:51.631218"
737
+ },
738
+ {
739
+ "task_id": "e297dfde-95e5-462b-a6e5-8fa43bc30bc0",
740
+ "model": "speech31/wavlm-large-english-ipa",
741
+ "subset": "test",
742
+ "num_files": 1680,
743
+ "average_per": 0.3694017596969614,
744
+ "average_pwed": 0.1356824900612308,
745
+ "detailed_results": [
746
+ {
747
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
748
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
749
+ "prediction": "ʃihædjɔɹdɑɹksutɪnɡɹisiwɑʃwɔtɹ̩ɔljɪɹ",
750
+ "per": 0.2727272727272727,
751
+ "pwed": 0.11274509803921567
752
+ },
753
+ {
754
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
755
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
756
+ "prediction": "dɑntæskmitəkæɹiænojliɹæɡlajkðæt",
757
+ "per": 0.39285714285714285,
758
+ "pwed": 0.13575268817204303
759
+ },
760
+ {
761
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
762
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
763
+ "prediction": "hɪzkæpptənwɑzθɪændhæɡɹ̩dænhɪzbjutəfəlbutswɹ̩wɔɹnɪnʃæbi",
764
+ "per": 0.3404255319148936,
765
+ "pwed": 0.12980769230769232
766
+ },
767
+ {
768
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
769
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
770
+ "prediction": "ðəɹizənzfɔɹðəsdajvsimdfulɪʃnaw",
771
+ "per": 0.20689655172413793,
772
+ "pwed": 0.051388888888888894
773
+ },
774
+ {
775
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
776
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
777
+ "prediction": "pɹədʌkʃənmejffɔlfɔɑɹbɪlowɪkspɛktejʃənz",
778
+ "per": 0.45454545454545453,
779
+ "pwed": 0.16666666666666666
780
+ }
781
+ ],
782
+ "timestamp": "2024-12-20T16:13:24.050232"
783
+ },
784
+ {
785
+ "task_id": "efe95f71-05e3-485d-8e0c-1823a3037cf4",
786
+ "model": "speech31/wavlm-large-english-ipa",
787
+ "subset": "test",
788
+ "num_files": 1680,
789
+ "average_per": 0.3694017596969614,
790
+ "average_pwed": 0.1356824900612308,
791
+ "detailed_results": [
792
+ {
793
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
794
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
795
+ "prediction": "ʃihædjɔɹdɑɹksutɪnɡɹisiwɑʃwɔtɹ̩ɔljɪɹ",
796
+ "per": 0.2727272727272727,
797
+ "pwed": 0.11274509803921567
798
+ },
799
+ {
800
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
801
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
802
+ "prediction": "dɑntæskmitəkæɹiænojliɹæɡlajkðæt",
803
+ "per": 0.39285714285714285,
804
+ "pwed": 0.13575268817204303
805
+ },
806
+ {
807
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
808
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
809
+ "prediction": "hɪzkæpptənwɑzθɪændhæɡɹ̩dænhɪzbjutəfəlbutswɹ̩wɔɹnɪnʃæbi",
810
+ "per": 0.3404255319148936,
811
+ "pwed": 0.12980769230769232
812
+ },
813
+ {
814
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
815
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
816
+ "prediction": "ðəɹizənzfɔɹðəsdajvsimdfulɪʃnaw",
817
+ "per": 0.20689655172413793,
818
+ "pwed": 0.051388888888888894
819
+ },
820
+ {
821
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
822
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
823
+ "prediction": "pɹədʌkʃənmejffɔlfɔɑɹbɪlowɪkspɛktejʃənz",
824
+ "per": 0.45454545454545453,
825
+ "pwed": 0.16666666666666666
826
+ }
827
+ ],
828
+ "timestamp": "2024-12-20T16:26:47.980084"
829
+ },
830
+ {
831
+ "task_id": "4b2ae2fc-fe2f-4f8b-9e8f-25c0bae13c0d",
832
+ "model": "speech31/XLS-R-300m-english-ipa",
833
+ "subset": "test",
834
+ "num_files": 1680,
835
+ "average_per": 0.36382554692045954,
836
+ "average_pwed": 0.1299702312124616,
837
+ "detailed_results": [
838
+ {
839
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
840
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
841
+ "prediction": "ʃihædjɔɹdɑɹksutɪnɡɹisiwɑʃwɔtɹ̩ɔljɪɹ",
842
+ "per": 0.2727272727272727,
843
+ "pwed": 0.11274509803921567
844
+ },
845
+ {
846
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
847
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
848
+ "prediction": "dɑntæskmitəkæɹiænojliɹæɡlajkðæt",
849
+ "per": 0.39285714285714285,
850
+ "pwed": 0.13575268817204303
851
+ },
852
+ {
853
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
854
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
855
+ "prediction": "hɪzkæmptənwɑzθɪnændhæɡɹ̩dɪndhɪzbjutəfəlbutswɹ̩wɔɹnɪnʃæbi",
856
+ "per": 0.3404255319148936,
857
+ "pwed": 0.14583333333333334
858
+ },
859
+ {
860
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
861
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
862
+ "prediction": "ðəɹɛzənzfɔɹðɪstajvsimdfulɪʃnaw",
863
+ "per": 0.2413793103448276,
864
+ "pwed": 0.052777777777777785
865
+ },
866
+ {
867
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
868
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
869
+ "prediction": "pɹədʌkʃənmejfɔlfɑɹbɪlowɛkspɛktejʃənz",
870
+ "per": 0.3939393939393939,
871
+ "pwed": 0.11921296296296297
872
+ }
873
+ ],
874
+ "timestamp": "2024-12-20T16:47:54.824174"
875
+ },
876
+ {
877
+ "task_id": "33d387c0-703c-415d-b8e2-81cea87a2146",
878
+ "model": "speech31/wav2vec2-large-english-TIMIT-phoneme_v3",
879
+ "subset": "test",
880
+ "num_files": 1680,
881
+ "average_per": 0.44563344149564776,
882
+ "average_pwed": 0.18844914029048124,
883
+ "detailed_results": [
884
+ {
885
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
886
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
887
+ "prediction": "ʃihædjʊrdɑrksutɪngrisiwɑʃwɔtərɔljɪrr",
888
+ "per": 0.3939393939393939,
889
+ "pwed": 0.12976190476190474
890
+ },
891
+ {
892
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
893
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
894
+ "prediction": "doʊntæskmitɪkɛriənɔɪliræglaɪkðətdnt",
895
+ "per": 0.39285714285714285,
896
+ "pwed": 0.19730392156862747
897
+ },
898
+ {
899
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
900
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
901
+ "prediction": "hɪzkæptənwɑzθɪnəndhægərdəndhɪzbjutəfəlbutswərwɔrnɪnʃæbibæb",
902
+ "per": 0.44680851063829785,
903
+ "pwed": 0.20394736842105265
904
+ },
905
+ {
906
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
907
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
908
+ "prediction": "ðərizənzfərðɪsstaɪvsimdfulɪʃnaʊa",
909
+ "per": 0.27586206896551724,
910
+ "pwed": 0.11328125
911
+ },
912
+ {
913
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
914
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
915
+ "prediction": "prədəkʃənmeɪfɔlfɑrbɪloʊɛkspɛkteɪʃənzd",
916
+ "per": 0.3939393939393939,
917
+ "pwed": 0.13626126126126126
918
+ }
919
+ ],
920
+ "timestamp": "2024-12-20T17:05:35.210786"
921
+ },
922
+ {
923
+ "task_id": "c89bcefc-3884-435a-a54c-24297fe6f041",
924
+ "model": "speech31/wav2vec2-large-TIMIT-IPA2",
925
+ "subset": "test",
926
+ "num_files": 1680,
927
+ "average_per": 0.4847029843149011,
928
+ "average_pwed": 0.2072006544586948,
929
+ "detailed_results": [
930
+ {
931
+ "file": "data/TEST/DR1/FAKS0/SA1.WAV",
932
+ "ground_truth": "ʃihædjɹdɑɹksuɾɪŋgɹisiwɑʃwɑɾɹʔɔljiɹ",
933
+ "prediction": "ʃihædjʊrdɑrksutɪngrisiwɑʃwɔtərɔljɪrər",
934
+ "per": 0.42424242424242425,
935
+ "pwed": 0.15393518518518517
936
+ },
937
+ {
938
+ "file": "data/TEST/DR1/FAKS0/SA2.WAV",
939
+ "ground_truth": "oʊnæsmitikɛɹiinɔɪliɹæglaɪkðæt",
940
+ "prediction": "doʊntæskmitɪkɛriənɔɪliræglaɪkðətdoʊndt",
941
+ "per": 0.5,
942
+ "pwed": 0.2623873873873874
943
+ },
944
+ {
945
+ "file": "data/TEST/DR1/FAKS0/SI1573.WAV",
946
+ "ground_truth": "hɪzkæpinwəsθɪnænhægɹdinɪzbjuɾuflbutswɹwɔɹninʃæbi",
947
+ "prediction": "hɪzkæptənwɑzθɪnəndhægərdəndhɪzbjutəfəlbutswərwɔrnəndʃæbiiii",
948
+ "per": 0.46808510638297873,
949
+ "pwed": 0.2191091954022989
950
+ },
951
+ {
952
+ "file": "data/TEST/DR1/FAKS0/SI2203.WAV",
953
+ "ground_truth": "ðiɹizənzfɹðɪsdaɪvsimdfuliʃnaʊ",
954
+ "prediction": "ðərizənzfərðɪstaɪvsimdfulɪʃnaʊ",
955
+ "per": 0.20689655172413793,
956
+ "pwed": 0.054166666666666675
957
+ },
958
+ {
959
+ "file": "data/TEST/DR1/FAKS0/SI943.WAV",
960
+ "ground_truth": "ɹdʌkʃinmeɪfɔlfɑɹbəloʊəkspikeɪʃnts",
961
+ "prediction": "prədəkʃənmeɪfɔlfɑrbɪloʊɛkspɛkteɪʃənzpzppppzpdtdtd",
962
+ "per": 0.7272727272727273,
963
+ "pwed": 0.34438775510204084
964
+ }
965
+ ],
966
+ "timestamp": "2024-12-20T22:50:50.641790"
967
  }
968
  ]
queue/tasks.json CHANGED
@@ -124,5 +124,134 @@
124
  "github_url": "https://huggingface.co/ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
125
  "status": "completed",
126
  "submitted_at": "2024-12-18T22:55:36.734691"
127
  }
128
  ]
 
124
  "github_url": "https://huggingface.co/ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
125
  "status": "completed",
126
  "submitted_at": "2024-12-18T22:55:36.734691"
127
+ },
128
+ {
129
+ "id": "59afc37a-0072-44dd-a02a-0cf47d89c120",
130
+ "model": "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
131
+ "subset": "test",
132
+ "submission_name": "wav2vec2 non-english transcription",
133
+ "github_url": "https://huggingface.co/ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
134
+ "status": "completed",
135
+ "submitted_at": "2024-12-18T23:47:03.488337"
136
+ },
137
+ {
138
+ "id": "e57eda9d-7a1d-4b41-9d47-a3d3839cac8b",
139
+ "model": "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-buckeye-ipa",
140
+ "subset": "test",
141
+ "submission_name": "phonetic transcription with the Buckeye corpus, from xlsr-53 model ",
142
+ "github_url": "https://github.com/ginic/multipa/tree/buckeye_experiments",
143
+ "status": "failed",
144
+ "submitted_at": "2024-12-19T11:48:26.415322",
145
+ "error": "Evaluation failed: (MaxRetryError(\"HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Max retries exceeded with url: /repos/a4/b1/a4b11f4627350048e021a84d10b89320db54e02c54b2a9366228f8a05cda220b/120f5bc04d1df15143033c93e3ef358981775b529f17e0db11e58a1b80754e67?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1734889736&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNDg4OTczNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2E0L2IxL2E0YjExZjQ2MjczNTAwNDhlMDIxYTg0ZDEwYjg5MzIwZGI1NGUwMmM1NGIyYTkzNjYyMjhmOGEwNWNkYTIyMGIvMTIwZjViYzA0ZDFkZjE1MTQzMDMzYzkzZTNlZjM1ODk4MTc3NWI1MjlmMTdlMGRiMTFlNThhMWI4MDc1NGU2Nz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=kfPD6ymEJuVvFZyuN3qL3xk4YJlpI5dqHgON4wJY-Mppwlp6x4Dw7cWdjEkJvMRF-bDuzNWQ3BEJPbsYouVW9WZMucDmxo38UwxSzIBhfWQxCYiHdUWuQPkypDUkI1mR3vbnCFQFXLiMQ2CgwWQz7q66OjIyq3suA00mhL2WcL8wvtovrfoEOkboEXCHCNLprfpoHpfoyfo~VS9~kmm61GN6SWbc9lzASIuT5FLkn~BJ6h405MgutQpNvrR4SHVLftk7rBmY8TAB3re5D0-9qFrMYb2Tk~9RKT3nxSNbgZVcEXzA5rYskcuGsrHoTuTTZ-NSW69K2M0IeivzFWTLNQ__&Key-Pair-Id=K24J24Z295AEI9 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x280544190>: Failed to establish a new connection: [Errno 51] Network is unreachable'))\"), '(Request ID: 14c9cc7c-47ee-47ae-b473-f4add807d233)')"
146
+ },
147
+ {
148
+ "id": "5517f6b2-6a76-4a2d-a6ce-33446f390c3b",
149
+ "model": "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-buckeye-ipa",
150
+ "subset": "test",
151
+ "submission_name": "phonetic transcription with the Buckeye corpus, from xlsr-53 model",
152
+ "github_url": "https://github.com/ginic/multipa/tree/buckeye_experiments",
153
+ "status": "completed",
154
+ "submitted_at": "2024-12-20T13:29:37.327317"
155
+ },
156
+ {
157
+ "id": "c2139f96-e79e-4f25-a525-aa039f65555f",
158
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-INTERNATIONAL1.9.2WithoutSpaces",
159
+ "subset": "test",
160
+ "submission_name": "phonetic transcription",
161
+ "github_url": "https://huggingface.co/Jubliano/wav2vec2-large-xls-r-300m-ipa-INTERNATIONAL1.5WithoutSpaces",
162
+ "status": "completed",
163
+ "submitted_at": "2024-12-20T14:01:35.626112"
164
+ },
165
+ {
166
+ "id": "d146f1f1-6e6e-4b28-9420-c652ae9a1002",
167
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-nl",
168
+ "subset": "test",
169
+ "submission_name": "Jubliano xlsr model",
170
+ "github_url": "https://huggingface.co/Jubliano/wav2vec2-large-xls-r-300m-ipa-nl1.1",
171
+ "status": "completed",
172
+ "submitted_at": "2024-12-20T15:08:45.949389"
173
+ },
174
+ {
175
+ "id": "265c5859-e7ba-492d-a6c9-45733dc17c99",
176
+ "model": "Jubliano/wav2vec2-large-xls-r-300m-ipa-nl",
177
+ "subset": "test",
178
+ "submission_name": "Jubliano xlsr model",
179
+ "github_url": "https://huggingface.co/Jubliano/wav2vec2-large-xls-r-300m-ipa-nl1.1",
180
+ "status": "completed",
181
+ "submitted_at": "2024-12-20T15:26:27.706187"
182
+ },
183
+ {
184
+ "id": "e297dfde-95e5-462b-a6e5-8fa43bc30bc0",
185
+ "model": "speech31/wavlm-large-english-ipa",
186
+ "subset": "test",
187
+ "submission_name": "speech31 phoneme transcription english",
188
+ "github_url": "https://huggingface.co/speech31/wavlm-large-english-ipa",
189
+ "status": "completed",
190
+ "submitted_at": "2024-12-20T15:56:25.445806"
191
+ },
192
+ {
193
+ "id": "efe95f71-05e3-485d-8e0c-1823a3037cf4",
194
+ "model": "speech31/wavlm-large-english-ipa",
195
+ "subset": "test",
196
+ "submission_name": "speech31 phoneme transcription english",
197
+ "github_url": "https://huggingface.co/speech31/wavlm-large-english-ipa",
198
+ "status": "completed",
199
+ "submitted_at": "2024-12-20T16:13:24.099308"
200
+ },
201
+ {
202
+ "id": "4b2ae2fc-fe2f-4f8b-9e8f-25c0bae13c0d",
203
+ "model": "speech31/XLS-R-300m-english-ipa",
204
+ "subset": "test",
205
+ "submission_name": "speech31 xlsr model",
206
+ "github_url": "https://huggingface.co/speech31/XLS-R-300m-english-ipa",
207
+ "status": "completed",
208
+ "submitted_at": "2024-12-20T16:33:23.864360"
209
+ },
210
+ {
211
+ "id": "33d387c0-703c-415d-b8e2-81cea87a2146",
212
+ "model": "speech31/wav2vec2-large-english-TIMIT-phoneme_v3",
213
+ "subset": "test",
214
+ "submission_name": "model is a fine-tuned version of facebook/wav2vec2-large on the TIMIT dataset",
215
+ "github_url": "https://huggingface.co/speech31/wav2vec2-large-english-TIMIT-phoneme_v3",
216
+ "status": "completed",
217
+ "submitted_at": "2024-12-20T16:52:07.883839"
218
+ },
219
+ {
220
+ "id": "03e4e265-dc1c-4052-88bd-4ef481938d9d",
221
+ "model": "speech31/wav2vec2-large-TIMIT-IPA2\"",
222
+ "subset": "test",
223
+ "submission_name": "fine-tuned version of facebook/wav2vec2-large on the None dataset",
224
+ "github_url": "https://huggingface.co/speech31/wav2vec2-large-TIMIT-IPA2",
225
+ "status": "failed",
226
+ "submitted_at": "2024-12-20T21:54:21.539246",
227
+ "error": "Evaluation failed: Incorrect path_or_model_id: 'speech31/wav2vec2-large-TIMIT-IPA2\"'. Please provide either the path to a local folder or the repo_id of a model on the Hub."
228
+ },
229
+ {
230
+ "id": "4575245b-ae3c-4969-adef-fd07a58560b9",
231
+ "model": "speech31/wav2vec2-large-TIMIT-IPA2\"",
232
+ "subset": "test",
233
+ "submission_name": "fine-tuned version of facebook/wav2vec2-large on the None dataset",
234
+ "github_url": "https://huggingface.co/speech31/wav2vec2-large-TIMIT-IPA2",
235
+ "status": "failed",
236
+ "submitted_at": "2024-12-20T21:54:25.670276",
237
+ "error": "Evaluation failed: Incorrect path_or_model_id: 'speech31/wav2vec2-large-TIMIT-IPA2\"'. Please provide either the path to a local folder or the repo_id of a model on the Hub."
238
+ },
239
+ {
240
+ "id": "c89bcefc-3884-435a-a54c-24297fe6f041",
241
+ "model": "speech31/wav2vec2-large-TIMIT-IPA2",
242
+ "subset": "test",
243
+ "submission_name": "fine-tuned version of facebook/wav2vec2-large on the None dataset",
244
+ "github_url": "https://huggingface.co/speech31/wav2vec2-large-TIMIT-IPA2",
245
+ "status": "completed",
246
+ "submitted_at": "2024-12-20T21:54:38.559569"
247
+ },
248
+ {
249
+ "id": "81fa94f8-94ae-4601-952c-24abaddaf691",
250
+ "model": "ginic/vary_individuals_young_only_3_wav2vec2-large-xlsr-buckeye-ipa",
251
+ "subset": "test",
252
+ "submission_name": "ginic model, facebook/wav2vec2-large-xlsr-53 fine tuned",
253
+ "github_url": "https://huggingface.co/ginic/vary_individuals_young_only_3_wav2vec2-large-xlsr-buckeye-ipa",
254
+ "status": "processing",
255
+ "submitted_at": "2024-12-21T01:15:41.870875"
256
  }
257
  ]
utils_display.py DELETED
@@ -1,31 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- # These classes are for user facing column names, to avoid having to change them
4
- # all around the code when a modif is needed
5
- @dataclass
6
- class ColumnContent:
7
- name: str
8
- type: str
9
-
10
- def fields(raw_class):
11
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
12
-
13
- @dataclass(frozen=True)
14
- class AutoEvalColumn: # Auto evals column
15
- model = ColumnContent("Model", "markdown")
16
- avg_per = ColumnContent("Average PER ⬇️", "number")
17
- avg_wped = ColumnContent("Average PWED ⬇️", "number")
18
-
19
-
20
- def make_clickable_model(model_name):
21
- link = f"https://huggingface.co/{model_name}"
22
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
23
-
24
- def styled_error(error):
25
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
26
-
27
- def styled_warning(warn):
28
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
29
-
30
- def styled_message(message):
31
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"