luluw committed on
Commit
dcae561
·
0 Parent(s):

Conformer Space

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. README.md +14 -0
  3. app.py +84 -0
  4. decoder.py +50 -0
  5. optimized_model.pt +3 -0
  6. requirements.txt +7 -0
  7. tokens.txt +29 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Conformer CTC Small
3
+ emoji: 📈
4
+ colorFrom: green
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.26.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: ASR model based on Conformer
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import gradio as gr
4
+
5
+ from torchaudio.transforms import Resample
6
+ from torchaudio.models.decoder import download_pretrained_files, ctc_decoder
7
+
8
+ from src.Conformer import get_featurizer
9
+
10
# Beam-search scoring constants used when building the CTC decoder.
LM_WEIGHT = 1.23    # passed to ctc_decoder as ``lm_weight``
WORD_SCORE = -0.26  # passed to ctc_decoder as ``word_score``
13
+
14
+
15
def preprocess_audio(audio_file, featurizer, target_sample_rate=16000):
    """
    Load an audio file and convert it into features for the model.

    Args:
        audio_file: Path (or file-like object) handed to ``torchaudio.load``.
        featurizer: Callable applied to the waveform tensor. Its output is
            permuted with ``(0, 2, 1)``, so it must be 3-dimensional.
        target_sample_rate: Sample rate in Hz the waveform is resampled to
            when the file was recorded at a different rate.

    Returns:
        The featurizer output with its last two dimensions swapped.

    Raises:
        ValueError: If loading, resampling or featurizing fails. The
            original exception is attached as ``__cause__``.
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_file)

        # torchaudio.load returns (channels, time). Every extra channel would
        # become an extra batch entry, but only the first decoded hypothesis
        # is ever used, so fold multi-channel recordings down to mono.
        if waveform.dim() == 2 and waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sample_rate != target_sample_rate:
            resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)

        return featurizer(waveform).permute(0, 2, 1)
    except Exception as e:
        # Chain the cause so the real failure stays visible in tracebacks.
        raise ValueError(f"Error in preprocessing audio: {e}") from e
23
+
24
+
25
import threading

# Decoders already built, keyed by the inputs that define them. Building a
# decoder loads the lexicon and language model, so it is done once per
# configuration rather than once per request.
_DECODER_CACHE = {}
# Guards both the cache and the decoder itself, since one decoder instance
# is now shared between calls.
_DECODER_LOCK = threading.Lock()


def _get_beam_search_decoder(tokens, files):
    """Return the cached CTC beam-search decoder for ``tokens``/``files``, building it on first use."""
    token_key = tokens if isinstance(tokens, str) else tuple(tokens)
    cache_key = (files.lexicon, files.lm, token_key, LM_WEIGHT, WORD_SCORE)

    decoder = _DECODER_CACHE.get(cache_key)
    if decoder is None:
        decoder = ctc_decoder(
            lexicon=files.lexicon,
            tokens=tokens,
            lm=files.lm,
            nbest=1,
            beam_size=100,
            beam_threshold=50,
            beam_size_token=25,
            lm_weight=LM_WEIGHT,
            word_score=WORD_SCORE,
        )
        _DECODER_CACHE[cache_key] = decoder
    return decoder


def decode_emission(emission, tokens, files):
    """
    Decode model emissions into a transcript with CTC beam search.

    Args:
        emission: Model output, passed unchanged to the decoder.
        tokens: Token list (or token file path) given to ``ctc_decoder``.
        files: Object exposing ``lexicon`` and ``lm`` attributes, e.g. the
            result of ``download_pretrained_files``.

    Returns:
        The words of the best hypothesis for the first batch entry, joined
        by single spaces.

    Raises:
        ValueError: If building the decoder or decoding fails. The original
            exception is attached as ``__cause__``.
    """
    try:
        with _DECODER_LOCK:
            beam_search_decoder = _get_beam_search_decoder(tokens, files)
            beam_search_result = beam_search_decoder(emission)
        return " ".join(beam_search_result[0][0].words).strip()
    except Exception as e:
        raise ValueError(f"Error in decoding: {e}") from e
42
+
43
+
44
def transcribe(audio_file, model, featurizer, tokens, files):
    """
    Run the full pipeline on one recording: features, model, decoding.

    Args:
        audio_file: Path to the recording, or ``None`` when nothing was
            recorded.
        model: Callable that maps the features to emissions.
        featurizer: Feature extractor forwarded to ``preprocess_audio``.
        tokens: Token list forwarded to ``decode_emission``.
        files: Decoder resource bundle forwarded to ``decode_emission``.

    Returns:
        The transcript on success. On failure a message starting with
        ``"Error processing audio: "`` is returned instead of raising, so
        the caller always receives a string.
    """
    if audio_file is None:
        # Nothing to load; say so directly instead of surfacing a loader error.
        return "Error processing audio: no audio was provided"

    try:
        features = preprocess_audio(audio_file, featurizer)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.inference_mode():
            emission = model(features)
        return decode_emission(emission, tokens, files)
    except Exception as e:
        return f"Error processing audio: {e}"
51
+
52
+
53
def launch_app(model_path, token_path="assets/tokens.txt", share=False):
    """
    Load the model and decoder resources, then start the Gradio interface.

    Args:
        model_path: Path to the TorchScript model file.
        token_path: Path to a text file with one token per line.
        share: Forwarded to ``Interface.launch(share=...)``.
    """
    # map_location lets a model that was saved from a GPU load on a
    # CPU-only host.
    model = torch.jit.load(model_path, map_location="cpu")
    model.eval()

    with open(token_path, "r", encoding="utf-8") as f:
        tokens = f.read().splitlines()

    files = download_pretrained_files("librispeech-4-gram")
    featurizer = get_featurizer()

    def gradio_transcribe(audio_file):
        # Bind the loaded resources so Gradio only has to pass the recording.
        return transcribe(audio_file, model, featurizer, tokens, files)

    interface = gr.Interface(
        fn=gradio_transcribe,
        inputs=gr.Audio(sources="microphone", type="filepath", label="Speak into the microphone"),
        outputs="text",
        title="Conformer-Small ASR Model",
        description="Speak into the microphone, and the model will transcribe your speech.",
    )

    interface.launch(share=share)
75
+
76
+
77
if __name__ == "__main__":
    # Script entry point: serve the bundled model with the bundled tokens.
    try:
        model_path = "optimized_model.pt"
        token_path = "tokens.txt"
        share = False
        launch_app(model_path, token_path, share)
    except Exception as e:
        # Keep the ValueError wrapper but chain the cause so the original
        # exception type and traceback are not lost.
        raise ValueError(f"Fatal error: {e}") from e
decoder.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+
4
+ from torchaudio.models.decoder import ctc_decoder, download_pretrained_files
5
+
6
+
7
class SpeechRecognitionEngine:
    """
    ASR engine to transcribe recorded audio.

    Holds a TorchScript model and a lexicon-based CTC beam-search decoder
    built from the "librispeech-4-gram" pretrained files.
    """
    def __init__(self, model_file, token_path):
        """
        Load the model and build the beam-search decoder.

        Args:
            model_file: Path to the TorchScript model file.
            token_path: Path to a text file with one token per line.
        """
        # map_location lets a model that was saved from a GPU load on a
        # CPU-only host.
        self.model = torch.jit.load(model_file, map_location="cpu")
        self.model.eval()

        # Load decoder files and tokens
        files = download_pretrained_files("librispeech-4-gram")

        with open(token_path, 'r', encoding="utf-8") as f:
            tokens = f.read().splitlines()

        self.decoder = ctc_decoder(
            lexicon=files.lexicon,
            tokens=tokens,
            lm=files.lm,
            nbest=1,
            beam_size=50,
            beam_threshold=25,
            beam_size_token=20,
            lm_weight=1.23,
            word_score=-0.26,
        )

        print("Loaded beam search with Ken LM")

    def transcribe(self, model, featurizer, filename):
        """
        Transcribe audio from a file using the ASR model.

        Args:
            model: Callable mapping features to emissions. When ``None``,
                the model loaded in ``__init__`` is used.
            featurizer: Callable applied to the loaded waveform; its output
                is permuted with ``(0, 2, 1)`` before reaching the model.
            filename: Path of the audio file to transcribe.

        Returns:
            The words of the best hypothesis for the first batch entry,
            joined by single spaces.

        Raises:
            RuntimeError: If any step fails. The original exception is
                attached as ``__cause__``.
        """
        try:
            if model is None:
                model = self.model
            # NOTE(review): the sample rate is discarded and no resampling is
            # done here - confirm inputs already match the model's rate.
            waveform, _ = torchaudio.load(filename)
            mel = featurizer(waveform).permute(0, 2, 1)  # Prepare mel features
            with torch.inference_mode():
                out = model(mel)
                results = self.decoder(out)
            return " ".join(results[0][0].words).strip()
        except Exception as e:
            raise RuntimeError(f"Error during transcription: {e}") from e
50
+
optimized_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:facf55228dc50431cae1d1d02fd36c809a16ee34ef849e4a7c2f8cb1bfd0aaab
3
+ size 44005738
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ numpy
4
+ soundfile
5
+ sox
6
+ gradio
7
+ flashlight-text
tokens.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '
2
+ |
3
+ a
4
+ b
5
+ c
6
+ d
7
+ e
8
+ f
9
+ g
10
+ h
11
+ i
12
+ j
13
+ k
14
+ l
15
+ m
16
+ n
17
+ o
18
+ p
19
+ q
20
+ r
21
+ s
22
+ t
23
+ u
24
+ v
25
+ w
26
+ x
27
+ y
28
+ z
29
+ -