Spaces:
Sleeping
Sleeping
Commit
·
dcae561
0
Parent(s):
Conformer Space
Browse files- .gitattributes +1 -0
- README.md +14 -0
- app.py +84 -0
- decoder.py +50 -0
- optimized_model.pt +3 -0
- requirements.txt +7 -0
- tokens.txt +29 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Conformer CTC Small
|
3 |
+
emoji: 📈
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.26.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
short_description: ASR model based on Conformer
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torchaudio
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
from torchaudio.transforms import Resample
|
6 |
+
from torchaudio.models.decoder import download_pretrained_files, ctc_decoder
|
7 |
+
|
8 |
+
from src.Conformer import get_featurizer
|
9 |
+
|
10 |
+
# Beam-search scoring constants: decode_emission passes them to ctc_decoder
# as its lm_weight and word_score arguments.
|
11 |
+
LM_WEIGHT = 1.23
|
12 |
+
WORD_SCORE = -0.26
|
13 |
+
|
14 |
+
|
15 |
+
def preprocess_audio(audio_file, featurizer, target_sample_rate=16000):
    """Load an audio file and convert it into features for the model.

    Args:
        audio_file: Source handed to ``torchaudio.load``.
        featurizer: Callable applied to the waveform tensor. Its result is
            permuted with ``(0, 2, 1)``, so it must be 3-dimensional.
        target_sample_rate: Rate the waveform is resampled to when the file
            was recorded at a different one.

    Returns:
        The featurizer output with its last two dimensions swapped.

    Raises:
        ValueError: If loading, resampling or featurizing fails. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_file)
        if sample_rate != target_sample_rate:
            resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)
        return featurizer(waveform).permute(0, 2, 1)
    except Exception as e:
        # Chain explicitly so the root failure stays visible in tracebacks.
        raise ValueError(f"Error in preprocessing audio: {e}") from e
|
23 |
+
|
24 |
+
|
25 |
+
def decode_emission(emission, tokens, files):
    """Decode model emissions into a transcript with lexicon beam search.

    Args:
        emission: Emission tensor produced by the acoustic model.
        tokens: Token list (or token file path) given to ``ctc_decoder``.
        files: Object exposing ``lexicon`` and ``lm`` attributes, as returned
            by ``download_pretrained_files``.

    Returns:
        The words of the best hypothesis for the first batch item, joined by
        single spaces.

    Raises:
        ValueError: If building the decoder or decoding fails. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        beam_search_decoder = _get_beam_search_decoder(tokens, files)
        beam_search_result = beam_search_decoder(emission)
        return " ".join(beam_search_result[0][0].words).strip()
    except Exception as e:
        raise ValueError(f"Error in decoding: {e}") from e


def _get_beam_search_decoder(tokens, files):
    """Return the decoder for these inputs, building it only on first use."""
    token_key = tokens if isinstance(tokens, str) else tuple(tokens)
    cache_key = (str(files.lexicon), str(files.lm), token_key)
    decoder = _DECODER_CACHE.get(cache_key)
    if decoder is None:
        # Construction loads the lexicon and language model from disk, so it
        # is done once per configuration instead of once per request.
        decoder = ctc_decoder(
            lexicon=files.lexicon,
            tokens=tokens,
            lm=files.lm,
            nbest=1,
            beam_size=100,
            beam_threshold=50,
            beam_size_token=25,
            lm_weight=LM_WEIGHT,
            word_score=WORD_SCORE,
        )
        _DECODER_CACHE[cache_key] = decoder
    return decoder


# Decoders already built, keyed by (lexicon path, LM path, tokens).
# NOTE(review): the cached decoder is shared between calls; confirm requests
# are not decoded concurrently before raising Gradio's concurrency limit.
_DECODER_CACHE = {}
|
42 |
+
|
43 |
+
|
44 |
+
def transcribe(audio_file, model, featurizer, tokens, files):
    """Run the full audio-to-text pipeline and return the transcript.

    Args:
        audio_file: Path of the recording to transcribe. Falsy values (for
            example ``None`` when nothing was recorded) are reported without
            touching the model.
        model: Callable acoustic model applied to the extracted features.
        featurizer: Feature extractor forwarded to ``preprocess_audio``.
        tokens: Token list forwarded to ``decode_emission``.
        files: Decoder files forwarded to ``decode_emission``.

    Returns:
        The transcript, or a string starting with ``"Error processing audio: "``
        when any step fails. This function never raises.
    """
    if not audio_file:
        return "Error processing audio: no audio was recorded"
    try:
        features = preprocess_audio(audio_file, featurizer)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.inference_mode():
            emission = model(features)
        return decode_emission(emission, tokens, files)
    except Exception as e:
        return f"Error processing audio: {e}"
|
51 |
+
|
52 |
+
|
53 |
+
def launch_app(model_path, token_path="assets/tokens.txt", share=False):
    """Load the model and decoder assets, then start the Gradio interface.

    Args:
        model_path: Path to the TorchScript model loaded with ``torch.jit.load``.
        token_path: Path to a text file with one token per line.
        share: Forwarded to ``Interface.launch(share=...)``.
    """
    # map_location keeps loading on the CPU even when the model was scripted
    # on a GPU machine, where a later ``.to('cpu')`` would come too late.
    model = torch.jit.load(model_path, map_location="cpu")
    model.eval()

    with open(token_path, "r", encoding="utf-8") as f:
        tokens = f.read().splitlines()

    files = download_pretrained_files("librispeech-4-gram")
    featurizer = get_featurizer()

    def gradio_transcribe(audio_file):
        # Bind the loaded assets so Gradio only has to supply the recording.
        return transcribe(audio_file, model, featurizer, tokens, files)

    interface = gr.Interface(
        fn=gradio_transcribe,
        inputs=gr.Audio(sources="microphone", type="filepath", label="Speak into the microphone"),
        outputs="text",
        title="Conformer-Small ASR Model",
        description="Speak into the microphone, and the model will transcribe your speech.",
    )

    interface.launch(share=share)
|
75 |
+
|
76 |
+
|
77 |
+
if __name__ == "__main__":
    # Script entry point: both paths are relative to the working directory.
    try:
        model_path = "optimized_model.pt"
        token_path = "tokens.txt"
        share = False
        launch_app(model_path, token_path, share)
    except Exception as e:
        # Chain explicitly so the original failure remains the reported cause.
        raise ValueError(f"Fatal error: {e}") from e
|
decoder.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torchaudio
|
3 |
+
|
4 |
+
from torchaudio.models.decoder import ctc_decoder, download_pretrained_files
|
5 |
+
|
6 |
+
|
7 |
+
class SpeechRecognitionEngine:
    """
    ASR engine to transcribe recorded audio.

    Loads a TorchScript model and builds one lexicon beam-search decoder in
    the constructor, so both are reused by every ``transcribe`` call.
    """
    def __init__(self, model_file, token_path):
        """
        Load the model and construct the beam-search decoder.

        Args:
            model_file: Path to the TorchScript model.
            token_path: Path to a text file with one token per line.
        """
        # map_location keeps loading on the CPU even when the model was
        # scripted on a GPU machine.
        self.model = torch.jit.load(model_file, map_location="cpu")
        self.model.eval()

        # Load decoder files and tokens
        files = download_pretrained_files("librispeech-4-gram")

        with open(token_path, 'r', encoding="utf-8") as f:
            tokens = f.read().splitlines()

        self.decoder = ctc_decoder(
            lexicon=files.lexicon,
            tokens=tokens,
            lm=files.lm,
            nbest=1,
            beam_size=50,
            beam_threshold=25,
            beam_size_token=20,
            lm_weight=1.23,
            word_score=-0.26,
        )

        print("Loaded beam search with Ken LM")

    def transcribe(self, model, featurizer, filename, target_sample_rate=16000):
        """
        Transcribe audio from a file using the ASR model.

        Args:
            model: Callable acoustic model; when ``None``, the model loaded by
                the constructor is used.
            featurizer: Callable applied to the waveform; its output is
                permuted with ``(0, 2, 1)`` before reaching the model.
            filename: Audio source handed to ``torchaudio.load``.
            target_sample_rate: Rate the waveform is resampled to when the
                file's rate differs; pass ``None`` to skip resampling.
                Defaults to 16000, the default used by app.py's
                ``preprocess_audio``.

        Returns:
            The words of the best hypothesis for the first batch item, joined
            by single spaces.

        Raises:
            RuntimeError: If any step fails. The underlying exception is
                attached as ``__cause__``.
        """
        try:
            waveform, sample_rate = torchaudio.load(filename)
            if target_sample_rate is not None and sample_rate != target_sample_rate:
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, target_sample_rate
                )
            mel = featurizer(waveform).permute(0, 2, 1)  # Prepare mel features
            asr_model = self.model if model is None else model
            with torch.inference_mode():
                out = asr_model(mel)
            results = self.decoder(out)
            return " ".join(results[0][0].words).strip()
        except Exception as e:
            raise RuntimeError(f"Error during transcription: {e}") from e
|
50 |
+
|
optimized_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:facf55228dc50431cae1d1d02fd36c809a16ee34ef849e4a7c2f8cb1bfd0aaab
|
3 |
+
size 44005738
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
torchaudio
|
3 |
+
numpy
|
4 |
+
soundfile
|
5 |
+
sox
|
6 |
+
gradio
|
7 |
+
flashlight-text
|
tokens.txt
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'
|
2 |
+
|
|
3 |
+
a
|
4 |
+
b
|
5 |
+
c
|
6 |
+
d
|
7 |
+
e
|
8 |
+
f
|
9 |
+
g
|
10 |
+
h
|
11 |
+
i
|
12 |
+
j
|
13 |
+
k
|
14 |
+
l
|
15 |
+
m
|
16 |
+
n
|
17 |
+
o
|
18 |
+
p
|
19 |
+
q
|
20 |
+
r
|
21 |
+
s
|
22 |
+
t
|
23 |
+
u
|
24 |
+
v
|
25 |
+
w
|
26 |
+
x
|
27 |
+
y
|
28 |
+
z
|
29 |
+
-
|