Spaces:

luluw
/

Conformer-CTC-Small

Sleeping

App Files Files Community

luluw committed on Apr 24

Commit

dcae561

0 Parent(s):

Conformer Space

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +14 -0
app.py +84 -0
decoder.py +50 -0
optimized_model.pt +3 -0
requirements.txt +7 -0
tokens.txt +29 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.pt filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Conformer CTC Small
+emoji: 📈
+colorFrom: green
+colorTo: green
+sdk: gradio
+sdk_version: 5.26.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: ASR model based on Conformer
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import torch
+import torchaudio
+import gradio as gr
+from torchaudio.transforms import Resample
+from torchaudio.models.decoder import download_pretrained_files, ctc_decoder
+from src.Conformer import get_featurizer
+# Constants for decoding: both are handed to ctc_decoder() when the
+# beam-search decoder is built.
+# LM_WEIGHT is supplied as the decoder's `lm_weight` argument.
+LM_WEIGHT = 1.23
+# WORD_SCORE is supplied as the decoder's `word_score` argument.
+WORD_SCORE = -0.26
+def preprocess_audio(audio_file, featurizer, target_sample_rate=16000):
+    try:
+        waveform, sample_rate = torchaudio.load(audio_file)
+        if sample_rate != target_sample_rate:
+            waveform = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)(waveform)
+        return featurizer(waveform).permute(0, 2, 1)
+    except Exception as e:
+        raise ValueError(f"Error in preprocessing audio: {e}")
+def decode_emission(emission, tokens, files):
+    try:
+        beam_search_decoder = ctc_decoder(
+            lexicon=files.lexicon,
+            tokens=tokens,
+            lm=files.lm,
+            nbest=1,
+            beam_size=100,
+            beam_threshold=50,
+            beam_size_token=25,
+            lm_weight=LM_WEIGHT,
+            word_score=WORD_SCORE,
+        )
+        beam_search_result = beam_search_decoder(emission)
+        return " ".join(beam_search_result[0][0].words).strip()
+    except Exception as e:
+        raise ValueError(f"Error in decoding: {e}")
+def transcribe(audio_file, model, featurizer, tokens, files):
+    try:
+        waveform = preprocess_audio(audio_file, featurizer)
+        emission = model(waveform)
+        return decode_emission(emission, tokens, files)
+    except Exception as e:
+        return f"Error processing audio: {e}"
+def launch_app(model_path, token_path="assets/tokens.txt", share=False):
+    model = torch.jit.load(model_path)
+    model.eval().to('cpu')
+    with open(token_path, 'r') as f:
+        tokens = f.read().splitlines()
+    files = download_pretrained_files("librispeech-4-gram")
+    featurizer = get_featurizer()
+    def gradio_transcribe(audio_file):
+        return transcribe(audio_file, model, featurizer, tokens, files)
+    interface = gr.Interface(
+        fn=gradio_transcribe,
+        inputs=gr.Audio(sources="microphone", type="filepath", label="Speak into the microphone"),
+        outputs="text",
+        title="Conformer-Small ASR Model",
+        description="Speak into the microphone, and the model will transcribe your speech.",
+    )
+    interface.launch(share=share)
+if __name__ == "__main__":
+    try:
+        model_path = "optimized_model.pt"
+        token_path = "tokens.txt"
+        share = False
+        launch_app(model_path, token_path, share)
+    except Exception as e:
+        raise ValueError(f"Fatal error: {e}")

decoder.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import torch
+import torchaudio
+from torchaudio.models.decoder import ctc_decoder, download_pretrained_files
+class SpeechRecognitionEngine:
+    """
+    ASR engine to transcribe recorded audio.
+    """
+    def __init__(self, model_file, token_path):
+        self.model = torch.jit.load(model_file)
+        self.model.eval().to('cpu')
+        # Load decoder files and tokens
+        files = download_pretrained_files("librispeech-4-gram")
+        with open(token_path, 'r') as f:
+            tokens = f.read().splitlines()
+        self.decoder = ctc_decoder(
+            lexicon=files.lexicon,
+            tokens=tokens,
+            lm=files.lm,
+            nbest=1,
+            beam_size=50,
+            beam_threshold=25,
+            beam_size_token=20,
+            lm_weight=1.23,
+            word_score=-0.26,
+        )
+        print("Loaded beam search with Ken LM")
+    def transcribe(self, model, featurizer, filename):
+        """
+        Transcribe audio from a file using the ASR model.
+        """
+        try:
+            waveform, _ = torchaudio.load(filename)
+            mel = featurizer(waveform).permute(0, 2, 1)  # Prepare mel features
+            with torch.inference_mode():
+                out = model(mel)
+                results = self.decoder(out)
+                return " ".join(results[0][0].words).strip()
+        except Exception as e:
+            raise RuntimeError(f"Error during transcription: {e}")

optimized_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:facf55228dc50431cae1d1d02fd36c809a16ee34ef849e4a7c2f8cb1bfd0aaab
+size 44005738

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+torch
+torchaudio
+numpy
+soundfile
+sox
+gradio
+flashlight-text

tokens.txt ADDED Viewed

	@@ -0,0 +1,29 @@

+'
+|
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+-