Yehor Smoliakov committed
Commit 287ac53 · 1 Parent(s): 5879d4f
Files changed (12)
  1. .gitattributes +6 -35
  2. .gitignore +5 -0
  3. README.md +24 -8
  4. app.py +243 -0
  5. example_1.wav +0 -0
  6. example_2.wav +0 -0
  7. example_3.wav +0 -0
  8. example_4.wav +0 -0
  9. example_5.wav +0 -0
  10. example_6.wav +0 -0
  11. requirements-dev.txt +1 -0
  12. requirements.txt +11 -0
.gitattributes CHANGED
@@ -1,35 +1,6 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sample_1.wav filter=lfs diff=lfs merge=lfs -text
+ sample_2.wav filter=lfs diff=lfs merge=lfs -text
+ sample_3.wav filter=lfs diff=lfs merge=lfs -text
+ sample_4.wav filter=lfs diff=lfs merge=lfs -text
+ sample_5.wav filter=lfs diff=lfs merge=lfs -text
+ sample_6.wav filter=lfs diff=lfs merge=lfs -text
 
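The added entries are exactly the lines that `git lfs track` appends for each sample, so a hypothetical (not part of this commit) way to regenerate them, assuming `git-lfs` is installed, would be:

```shell
# Hypothetical regeneration of the new .gitattributes entries:
for i in 1 2 3 4 5 6; do git lfs track "sample_${i}.wav"; done
```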
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .idea/
+ .venv/
+ .ruff_cache/
+
+ flagged/
README.md CHANGED
@@ -1,13 +1,29 @@
  ---
- title: W2v Bert 2.0 Uk V2.1 Demo
- emoji: 🌍
- colorFrom: yellow
- colorTo: gray
  sdk: gradio
- sdk_version: 4.40.0
  app_file: app.py
- pinned: false
- license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Speech-to-Text for Ukrainian v2
+ emoji: 🔥
+ colorFrom: blue
+ colorTo: yellow
  sdk: gradio
  app_file: app.py
+ pinned: true
+ sdk_version: 4.39.0
  ---

+ ## Install
+
+ ```shell
+ uv venv --python 3.10
+
+ source .venv/bin/activate
+
+ uv pip install -r requirements.txt
+
+ # in development mode
+ uv pip install -r requirements-dev.txt
+ ```
+
+ ## Run
+
+ ```shell
+ python app.py
+ ```
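After installing, a quick sanity check (suggested here, not part of the committed README) confirms whether PyTorch sees the GPU the app will pick at startup:

```shell
# Prints True on a CUDA-capable machine; the app falls back to CPU otherwise.
python -c "import torch; print(torch.cuda.is_available())"
```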
app.py ADDED
@@ -0,0 +1,243 @@
+ import sys
+ import time
+
+ from importlib.metadata import version
+
+ import torch
+ import torchaudio
+ import torchaudio.transforms as T
+
+ import gradio as gr
+
+ from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
+
+ # Config
+ model_name = "Yehor/w2v-bert-2.0-uk-v2.1"
+
+ min_duration = 0.5
+ max_duration = 60
+
+ concurrency_limit = 5
+ use_torch_compile = False
+
+ # Torch
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ # Load the model
+ asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype).to(
+     device
+ )
+ processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
+
+ if use_torch_compile:
+     asr_model = torch.compile(asr_model)
+
+ # Elements
+ examples = [
+     "example_1.wav",
+     "example_2.wav",
+     "example_3.wav",
+     "example_4.wav",
+     "example_5.wav",
+     "example_6.wav",
+ ]
+
+ examples_table = """
+ | File | Text |
+ | ------------- | ------------- |
+ | `example_1.wav` | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
+ | `example_2.wav` | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування |
+ | `example_3.wav` | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні |
+ | `example_4.wav` | використовує на фронті все що має і хімічна зброя не вийняток тож з чим маємо справу розбиралася марія моганисян |
+ | `example_5.wav` | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
+ | `example_6.wav` | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
+ """.strip()
+
+ # https://www.tablesgenerator.com/markdown_tables
+ authors_table = """
+ ## Authors
+
+ Follow them on social networks and **contact** them if you need any help or have any questions:
+
+ | <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+ |-------------------------------------------------------------------------------------------------|
+ | https://t.me/smlkw in Telegram |
+ | https://x.com/yehor_smoliakov at X |
+ | https://github.com/egorsmkv at GitHub |
+ | https://huggingface.co/Yehor at Hugging Face |
+ | or use [email protected] |
+ """.strip()
+
+ description_head = f"""
+ # Speech-to-Text for Ukrainian v2
+
+ ## Overview
+
+ This space uses the https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2.1 model to recognize speech in audio files.
+
+ > Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds.
+ """.strip()
+
+ description_foot = f"""
+ ## Community
+
+ - Join our Discord server where we talk about AI/ML/DL: https://discord.gg/yVAjkBgmt4
+ - Join our Speech Recognition group in Telegram: https://t.me/speech_recognition_uk
+
+ ## More
+
+ Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
+
+ {authors_table}
+ """.strip()
+
+ transcription_value = """
+ Recognized text will appear here.
+
+ Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record your own voice.
+ """.strip()
+
+ tech_env = f"""
+ #### Environment
+
+ - Python: {sys.version}
+ - Torch device: {device}
+ - Torch dtype: {torch_dtype}
+ - Use torch.compile: {use_torch_compile}
+ """.strip()
+
+ tech_libraries = f"""
+ #### Libraries
+
+ - torch: {version('torch')}
+ - torchaudio: {version('torchaudio')}
+ - transformers: {version('transformers')}
+ - gradio: {version('gradio')}
+ """.strip()
+
+
+ def inference(audio_path, progress=gr.Progress()):
+     if not audio_path:
+         raise gr.Error("Please upload an audio file.")
+
+     gr.Info("Starting recognition", duration=2)
+
+     progress(0, desc="Recognizing")
+
+     meta = torchaudio.info(audio_path)
+     duration = meta.num_frames / meta.sample_rate
+
+     if duration < min_duration:
+         raise gr.Error(
+             f"The duration of the file is less than {min_duration} seconds; it is {round(duration, 2)} seconds."
+         )
+     if duration > max_duration:
+         raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
+
+     paths = [
+         audio_path,
+     ]
+
+     results = []
+
+     for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
+         t0 = time.time()
+
+         meta = torchaudio.info(path)
+         audio_duration = meta.num_frames / meta.sample_rate
+
+         audio_input, sr = torchaudio.load(path)
+
+         # Downmix multi-channel audio to mono.
+         if meta.num_channels > 1:
+             audio_input = torch.mean(audio_input, dim=0, keepdim=True)
+
+         # Resample to the 16 kHz rate the model expects.
+         if sr != 16_000:
+             resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
+             audio_input = resampler(audio_input)
+
+         audio_input = audio_input.squeeze().numpy()
+
+         features = processor([audio_input], sampling_rate=16_000).input_features
+         features = torch.tensor(features).to(device)
+
+         if torch_dtype == torch.float16:
+             features = features.half()
+
+         with torch.inference_mode():
+             logits = asr_model(features).logits
+
+         # Greedy CTC decoding.
+         predicted_ids = torch.argmax(logits, dim=-1)
+         predictions = processor.batch_decode(predicted_ids)
+
+         if not predictions:
+             predictions = ["-"]
+
+         elapsed_time = round(time.time() - t0, 2)
+         rtf = round(elapsed_time / audio_duration, 4)
+         audio_duration = round(audio_duration, 2)
+
+         results.append(
+             {
+                 "path": path.split("/")[-1],
+                 "transcription": "\n".join(predictions),
+                 "audio_duration": audio_duration,
+                 "rtf": rtf,
+             }
+         )
+
+     gr.Info("Finished!", duration=2)
+
+     result_texts = []
+
+     for result in results:
+         result_texts.append(f'**{result["path"]}**')
+         result_texts.append("\n\n")
+         result_texts.append(f'> {result["transcription"]}')
+         result_texts.append("\n\n")
+         result_texts.append(f'**Audio duration**: {result["audio_duration"]}')
+         result_texts.append("\n")
+         result_texts.append(f'**Real-Time Factor**: {result["rtf"]}')
+
+     return "\n".join(result_texts)
+
+
+ demo = gr.Blocks(
+     title="Speech-to-Text for Ukrainian",
+     analytics_enabled=False,
+     theme=gr.themes.Base(),
+ )
+
+ with demo:
+     gr.Markdown(description_head)
+
+     gr.Markdown("## Usage")
+
+     with gr.Row():
+         audio_file = gr.Audio(label="Audio file", type="filepath")
+         transcription = gr.Markdown(
+             label="Transcription",
+             value=transcription_value,
+         )
+
+     gr.Button("Recognize").click(
+         inference,
+         concurrency_limit=concurrency_limit,
+         inputs=audio_file,
+         outputs=transcription,
+     )
+
+     with gr.Row():
+         gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)
+
+     gr.Markdown(examples_table)
+
+     gr.Markdown(description_foot)
+
+     gr.Markdown("### This Gradio app uses the following technologies:")
+     gr.Markdown(tech_env)
+     gr.Markdown(tech_libraries)
+
+ if __name__ == "__main__":
+     demo.queue()
+     demo.launch()
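Gradio wiring aside, the core of `app.py` is a plain CTC transcription loop. The sketch below is a minimal standalone version of that same path, under the assumptions the app already makes (packages from `requirements.txt` installed, the public `Yehor/w2v-bert-2.0-uk-v2.1` checkpoint, one of the bundled example files); it is illustrative, not part of the commit:

```python
# Minimal standalone sketch of the inference path used in app.py.
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor

model_name = "Yehor/w2v-bert-2.0-uk-v2.1"
model = AutoModelForCTC.from_pretrained(model_name)
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)

# Load a bundled example, downmix to mono, resample to 16 kHz.
audio, sr = torchaudio.load("example_1.wav")
if audio.shape[0] > 1:
    audio = audio.mean(dim=0, keepdim=True)
if sr != 16_000:
    audio = T.Resample(sr, 16_000, dtype=audio.dtype)(audio)

# Feature extraction and greedy CTC decoding, as in inference().
features = processor([audio.squeeze().numpy()], sampling_rate=16_000).input_features
features = torch.tensor(features)

with torch.inference_mode():
    logits = model(features).logits

predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids)[0])
```

The app additionally reports a Real-Time Factor, i.e. processing time divided by audio duration; values below 1 mean faster-than-real-time recognition.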
example_1.wav ADDED
Binary file (273 kB)

example_2.wav ADDED
Binary file (200 kB)

example_3.wav ADDED
Binary file (193 kB)

example_4.wav ADDED
Binary file (241 kB)

example_5.wav ADDED
Binary file (193 kB)

example_6.wav ADDED
Binary file (186 kB)
requirements-dev.txt ADDED
@@ -0,0 +1 @@
+ ruff
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio
+
+ torch
+ torchaudio
+
+ soundfile
+
+ triton
+ setuptools
+
+ transformers
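The dependency list is unpinned. If reproducible builds matter, one option (a workflow suggestion, not something this commit does) is to lock resolved versions with uv's pip-tools interface:

```shell
# Writes fully pinned versions resolved from the unpinned list.
uv pip compile requirements.txt -o requirements.lock
```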