Spaces:

dbdmg
/

robust-asr-it

Runtime error

App Files Files Community

MorenoLQ committed on Feb 23, 2022

Commit

7fce27b

1 Parent(s): 856bef6

Updated for file upload and missing inputs

Browse files

Files changed (3) hide show

app.py +66 -45
demo_example_1.mp3 +0 -0
gradio_queue.db +0 -0

app.py CHANGED Viewed

@@ -25,61 +25,78 @@ DICT_MODELS = {
 MODELS = sorted(DICT_MODELS.keys())
 CACHED_MODELS_BY_ID = {}
-def run(input_file, model_name, decoding_type, history):
-    logger.info(f"Running ASR {model_name}-{decoding_type} for {input_file}")
-    history = history or []
     model = DICT_MODELS.get(model_name)
-    if model is None:
-        history.append({
-            "error_message": f"Model size {model_size} not found for {language} language :("
-        })
-    elif decoding_type == "Guided by Language Model" and not model["has_lm"]:
         history.append({
-            "error_message": f"LM not available for {language} language :("
         })
     else:
-        # model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
-        model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
-        if model_instance is None:
-            model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
-            CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
-        if decoding_type == "Guided by Language Model":
-            processor = Wav2Vec2ProcessorWithLM.from_pretrained(model["model_id"])
-            asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
-                           feature_extractor=processor.feature_extractor, decoder=processor.decoder)
         else:
-            processor = Wav2Vec2Processor.from_pretrained(model["model_id"])
-            asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
-                           feature_extractor=processor.feature_extractor, decoder=None)
-        transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]
-        logger.info(f"Transcription for {input_file}: {transcription}")
-        history.append({
-            "model_id": model["model_id"],
-            "decoding_type": decoding_type,
-            "transcription": transcription,
-            "error_message": None
-        })
-    html_output = "<div class='result'>"
-    for item in history:
-        if item["error_message"] is not None:
-            html_output += f"<div class='result_item result_item_error'>{item['error_message']}</div>"
-        else:
-            url_suffix = " + Guided by Language Model" if item["decoding_type"] == "Guided by Language Model" else ""
-            html_output += "<div class='result_item result_item_success'>"
-            html_output += f'<strong><a target="_blank" href="https://huggingface.co/{item["model_id"]}">{item["model_id"]}{url_suffix}</a></strong><br/><br/>'
-            html_output += f'{item["transcription"]}<br/>'
-            html_output += "</div>"
-    html_output += "</div>"
     return html_output, history
@@ -87,7 +104,8 @@ def run(input_file, model_name, decoding_type, history):
 gr.Interface(
     run,
     inputs=[
-        gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
         gr.inputs.Radio(label="Model", choices=MODELS),
         gr.inputs.Radio(label="Decoding type", choices=["Standard", "Guided by Language Model"]),
         "state"
@@ -106,5 +124,8 @@ gr.Interface(
     """,
     allow_screenshot=False,
     allow_flagging="never",
-    theme="huggingface"
 ).launch(enable_queue=True)

 MODELS = sorted(DICT_MODELS.keys())
 CACHED_MODELS_BY_ID = {}
+def build_html(history):
+    html_output = "<div class='result'>"
+    for item in history:
+        if item["error_message"] is not None:
+            html_output += f"<div class='result_item result_item_error'>{item['error_message']}</div>"
+        else:
+            url_suffix = " + Guided by Language Model" if item["decoding_type"] == "Guided by Language Model" else ""
+            html_output += "<div class='result_item result_item_success'>"
+            html_output += f'<strong><a target="_blank" href="https://huggingface.co/{item["model_id"]}">{item["model_id"]}{url_suffix}</a></strong><br/><br/>'
+            html_output += f'{item["transcription"]}<br/>'
+            html_output += "</div>"
+    html_output += "</div>"
+    return html_output
+def run(uploaded_file, input_file, model_name, decoding_type, history):
     model = DICT_MODELS.get(model_name)
+    history = history or []
+    if uploaded_file is None and input_file is None:
         history.append({
+            "model_id": model["model_id"],
+            "decoding_type": decoding_type,
+            "transcription": "",
+            "error_message": "No input provided."
         })
     else:
+        if input_file is None:
+            input_file = uploaded_file
+        logger.info(f"Running ASR {model_name}-{decoding_type} for {input_file}")
+        history = history or []
+        if model is None:
+            history.append({
+                "error_message": f"Model size {model_size} not found for {language} language :("
+            })
+        elif decoding_type == "Guided by Language Model" and not model["has_lm"]:
+            history.append({
+                "error_message": f"LM not available for {language} language :("
+            })
         else:
+            # model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
+            model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
+            if model_instance is None:
+                model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
+                CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
+            if decoding_type == "Guided by Language Model":
+                processor = Wav2Vec2ProcessorWithLM.from_pretrained(model["model_id"])
+                asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
+                            feature_extractor=processor.feature_extractor, decoder=processor.decoder)
+            else:
+                processor = Wav2Vec2Processor.from_pretrained(model["model_id"])
+                asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
+                            feature_extractor=processor.feature_extractor, decoder=None)
+            transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]
+            logger.info(f"Transcription for {input_file}: {transcription}")
+            history.append({
+                "model_id": model["model_id"],
+                "decoding_type": decoding_type,
+                "transcription": transcription,
+                "error_message": None
+            })
+    html_output = build_html(history)
     return html_output, history
 gr.Interface(
     run,
     inputs=[
+        gr.inputs.Audio(source="upload", type='filepath', optional=True),
+        gr.inputs.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
         gr.inputs.Radio(label="Model", choices=MODELS),
         gr.inputs.Radio(label="Decoding type", choices=["Standard", "Guided by Language Model"]),
         "state"
     """,
     allow_screenshot=False,
     allow_flagging="never",
+    theme="huggingface",
+    examples = [
+        ['demo_example_1.mp3', 'demo_example_1.mp3', 'robust-300m', 'Guided by Language Model']
+    ]
 ).launch(enable_queue=True)

demo_example_1.mp3 ADDED Viewed

Binary file (121 kB). View file

gradio_queue.db CHANGED Viewed

Binary files a/gradio_queue.db and b/gradio_queue.db differ