Spaces:

Yehor
/

w2v-bert-uk-v2.1

Running

App Files Files Community

Yehor Smoliakov committed on Aug 8, 2024

Commit

287ac53

1 Parent(s): 5879d4f

Init

Browse files

Files changed (12) hide show

.gitattributes +6 -35
.gitignore +5 -0
README.md +24 -8
app.py +243 -0
example_1.wav +0 -0
example_2.wav +0 -0
example_3.wav +0 -0
example_4.wav +0 -0
example_5.wav +0 -0
example_6.wav +0 -0
requirements-dev.txt +1 -0
requirements.txt +11 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,6 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+# Track the bundled example recordings (example_N.wav) with Git LFS.
+example_1.wav filter=lfs diff=lfs merge=lfs -text
+example_2.wav filter=lfs diff=lfs merge=lfs -text
+example_3.wav filter=lfs diff=lfs merge=lfs -text
+example_4.wav filter=lfs diff=lfs merge=lfs -text
+example_5.wav filter=lfs diff=lfs merge=lfs -text
+example_6.wav filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.idea/
+.venv/
+.ruff_cache/
+flagged/

README.md CHANGED Viewed

@@ -1,13 +1,29 @@
 ---
-title: W2v Bert 2.0 Uk V2.1 Demo
-emoji: 🌍
-colorFrom: yellow
-colorTo: gray
 sdk: gradio
-sdk_version: 4.40.0
 app_file: app.py
-pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Speech-to-Text for Ukrainian v2
+emoji: 🔥
+colorFrom: blue
+colorTo: yellow
 sdk: gradio
 app_file: app.py
+pinned: true
+sdk_version: 4.39.0
 ---
+## Install
+```shell
+uv venv --python 3.10
+source .venv/bin/activate
+uv pip install -r requirements.txt
+# in development mode
+uv pip install -r requirements-dev.txt
+```
+## Run
+```shell
+python app.py
+```

app.py ADDED Viewed

	@@ -0,0 +1,243 @@

+import sys
+import time
+from importlib.metadata import version
+import torch
+import torchaudio
+import torchaudio.transforms as T
+import gradio as gr
+from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
+# Config: tunables read by the rest of this module
+model_name = "Yehor/w2v-bert-2.0-uk-v2.1"  # model id handed to both from_pretrained() calls below
+min_duration = 0.5  # shortest accepted audio, in seconds (checked in inference())
+max_duration = 60  # longest accepted audio, in seconds (checked in inference())
+concurrency_limit = 5  # passed as concurrency_limit to the "Recognize" click handler
+use_torch_compile = False  # when True the model is wrapped with torch.compile() below
+# Torch: use CUDA with float16 when available, otherwise CPU with float32
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load the model once at import time and move it to the selected device
+asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype).to(
+    device
+)
+processor = Wav2Vec2BertProcessor.from_pretrained(model_name)  # used for feature extraction and decoding in inference()
+if use_torch_compile:
+    asr_model = torch.compile(asr_model)
+# Elements: bundled example clips and static Markdown rendered by the UI
+examples = [  # file names offered through gr.Examples in the layout below
+    "example_1.wav",
+    "example_2.wav",
+    "example_3.wav",
+    "example_4.wav",
+    "example_5.wav",
+    "example_6.wav",
+]
+examples_table = """
+| File  | Text |
+| ------------- | ------------- |
+| `example_1.wav`  | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
+| `example_2.wav`  | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування |
+| `example_3.wav`  | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні  |
+| `example_4.wav`  | використовує на фронті все що має і хімічна зброя не вийняток тож з чим маємо справу розбиралася марія моганисян |
+| `example_5.wav`  | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
+| `example_6.wav`  | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
+""".strip()  # Markdown table pairing each example file with its text
+# Table layout generated with https://www.tablesgenerator.com/markdown_tables
+authors_table = """
+## Authors
+Follow them in social networks and **contact** if you need any help or have any questions:
+| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+|-------------------------------------------------------------------------------------------------|
+| https://t.me/smlkw in Telegram                                                                  |
+| https://x.com/yehor_smoliakov at X                                                              |
+| https://github.com/egorsmkv at GitHub                                                           |
+| https://huggingface.co/Yehor at Hugging Face                                                    |
+| or use [email protected]                                                                       |
+""".strip()  # NOTE(review): the e-mail in the last row looks redacted ("[email protected]") - confirm and restore the real address
+description_head = f"""
+# Speech-to-Text for Ukrainian v2
+## Overview
+This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2.1 model to recognize audio files.
+> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds.
+""".strip()
+description_foot = f"""
+## Community
+- Join our Discord server where we talk about AI/ML/DL: https://discord.gg/yVAjkBgmt4
+- Join our Speech Recognition group in Telegram: https://t.me/speech_recognition_uk
+## More
+Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
+{authors_table}
+""".strip()
+transcription_value = """
+Recognized text will appear here.
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record own voice.
+""".strip()
+tech_env = f"""
+#### Environment
+- Python: {sys.version}
+- Torch device: {device}
+- Torch dtype: {torch_dtype}
+- Use torch.compile: {use_torch_compile}
+""".strip()
+tech_libraries = f"""
+#### Libraries
+- torch: {version('torch')}
+- torchaudio: {version('torchaudio')}
+- transformers: {version('transformers')}
+- gradio: {version('gradio')}
+""".strip()
+def inference(audio_path, progress=gr.Progress()):
+    if not audio_path:
+        raise gr.Error("Please upload an audio file.")
+    gr.Info("Starting recognition", duration=2)
+    progress(0, desc="Recognizing")
+    meta = torchaudio.info(audio_path)
+    duration = meta.num_frames / meta.sample_rate
+    if duration < min_duration:
+        raise gr.Error(
+            f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
+        )
+    if duration > max_duration:
+        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
+    paths = [
+        audio_path,
+    ]
+    results = []
+    for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
+        t0 = time.time()
+        meta = torchaudio.info(audio_path)
+        audio_duration = meta.num_frames / meta.sample_rate
+        audio_input, sr = torchaudio.load(path)
+        if meta.num_channels > 1:
+            audio_input = torch.mean(audio_input, dim=0, keepdim=True)
+        if meta.sample_rate != 16_000:
+            resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
+            audio_input = resampler(audio_input)
+        audio_input = audio_input.squeeze().numpy()
+        features = processor([audio_input], sampling_rate=16_000).input_features
+        features = torch.tensor(features).to(device)
+        if torch_dtype == torch.float16:
+            features = features.half()
+        with torch.inference_mode():
+            logits = asr_model(features).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        predictions = processor.batch_decode(predicted_ids)
+        if not predictions:
+            predictions = "-"
+        elapsed_time = round(time.time() - t0, 2)
+        rtf = round(elapsed_time / audio_duration, 4)
+        audio_duration = round(audio_duration, 2)
+        results.append(
+            {
+                "path": path.split("/")[-1],
+                "transcription": "\n".join(predictions),
+                "audio_duration": audio_duration,
+                "rtf": rtf,
+            }
+        )
+    gr.Info("Finished!", duration=2)
+    result_texts = []
+    for result in results:
+        result_texts.append(f'**{result["path"]}**')
+        result_texts.append("\n\n")
+        result_texts.append(f'> {result["transcription"]}')
+        result_texts.append("\n\n")
+        result_texts.append(f'**Audio duration**: {result["audio_duration"]}')
+        result_texts.append("\n")
+        result_texts.append(f'**Real-Time Factor**: {result["rtf"]}')
+    return "\n".join(result_texts)
+demo = gr.Blocks(  # top-level UI object; populated below and launched in the __main__ guard
+    title="Speech-to-Text for Ukrainian",
+    analytics_enabled=False,
+    theme=gr.themes.Base(),
+)
+with demo:  # the page is built top to bottom in the order the components are created here
+    gr.Markdown(description_head)
+    gr.Markdown("## Usage")
+    with gr.Row():  # audio input and transcription output share one row
+        audio_file = gr.Audio(label="Audio file", type="filepath")  # hands inference() a file path
+        transcription = gr.Markdown(
+            label="Transcription",
+            value=transcription_value,  # placeholder text until the first result arrives
+        )
+    gr.Button("Recognize").click(  # run inference() on the selected audio and show its Markdown result
+        inference,
+        concurrency_limit=concurrency_limit,
+        inputs=audio_file,
+        outputs=transcription,
+    )
+    with gr.Row():
+        gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)  # choosing an example fills audio_file
+    gr.Markdown(examples_table)
+    gr.Markdown(description_foot)
+    gr.Markdown("### Gradio app uses the following technologies:")
+    gr.Markdown(tech_env)
+    gr.Markdown(tech_libraries)
+if __name__ == "__main__":  # start the app only when executed as a script
+    demo.queue()
+    demo.launch()

example_1.wav ADDED Viewed

Binary file (273 kB). View file

example_2.wav ADDED Viewed

Binary file (200 kB). View file

example_3.wav ADDED Viewed

Binary file (193 kB). View file

example_4.wav ADDED Viewed

Binary file (241 kB). View file

example_5.wav ADDED Viewed

Binary file (193 kB). View file

example_6.wav ADDED Viewed

Binary file (186 kB). View file

requirements-dev.txt ADDED Viewed

	@@ -0,0 +1 @@


+ruff

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio
+torch
+torchaudio
+soundfile
+triton
+setuptools
+transformers