"""Taiwanese Hakka streaming speech-recognition demo (Gradio + Vosk)."""

import json

import gradio as gr
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model


def load_vosk(model_id: str) -> Model:
    """Download a Vosk model snapshot from the Hugging Face Hub and load it.

    Registered below as an OmegaConf resolver so that entries in
    ``configs/models.yaml`` can interpolate directly into loaded models.
    """
    model_dir = snapshot_download(model_id)
    return Model(model_path=model_dir)


# Let "${load_vosk:...}" interpolations in the YAML resolve to Model objects.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))


def automatic_speech_recognition(model_id: str, dialect_id: str, stream, new_chunk):
    """Incrementally transcribe one streamed audio chunk with Vosk.

    Parameters
    ----------
    model_id:
        Key into ``models_config`` selecting the recognizer entry.
    dialect_id:
        Dialect key; only consulted when the model entry is a per-dialect dict.
    stream:
        ``None`` on the first chunk, otherwise the ``(recognizer, segments)``
        state tuple returned by the previous call (held in ``gr.State``).
    new_chunk:
        ``(sample_rate, audio_array)`` tuple as produced by ``gr.Audio``
        with ``type="numpy"``.

    Returns
    -------
    tuple
        ``((recognizer, segments), display_text)``: the updated stream state
        and the text to display — finalized segments joined with "," followed
        by the current partial hypothesis.
    """
    model_entry = models_config[model_id]["model"]
    # A dict entry maps dialect ids to models; otherwise one shared model.
    model = model_entry[dialect_id] if isinstance(model_entry, dict) else model_entry

    sample_rate, audio_array = new_chunk
    if audio_array.ndim == 2:
        # Keep only the first channel of multi-channel recordings.
        audio_array = audio_array[:, 0]
    # NOTE(review): Vosk expects 16-bit PCM; this assumes Gradio delivers
    # int16 samples — confirm, since a float dtype would be mis-decoded.
    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk: create a fresh recognizer and an empty segment list.
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)
        result = []
    else:
        rec, result = stream

    if rec.AcceptWaveform(audio_bytes):
        # An utterance was finalized; keep it only if non-empty.
        text_result = json.loads(rec.Result())["text"]
        if text_result != "":
            result.append(text_result)
        partial_result = ""
    else:
        partial_result = json.loads(rec.PartialResult())["partial"] + " "

    if result:
        output_text = ",".join(result) + "," + partial_result
    else:
        output_text = partial_result

    return (rec, result), output_text


def when_model_selected(model_id: str):
    """Repopulate the dialect dropdown for the newly selected model.

    Hides the dropdown when the model has no ``dialect_mapping``; otherwise
    refreshes its choices and selects the first dialect value.
    """
    model_config = models_config[model_id]
    if "dialect_mapping" not in model_config:
        return gr.update(visible=False)
    choices = list(model_config["dialect_mapping"].items())
    return gr.update(
        choices=choices,
        value=choices[0][1],
        visible=True,
    )


demo = gr.Blocks(
    title="臺灣客語語音辨識系統",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)

with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )

    # Fix: guard against a default model without a "dialect_mapping" — the
    # original indexed it unconditionally and would crash at startup, even
    # though when_model_selected() already handles the missing-mapping case.
    default_dialects = list(
        models_config[default_model_id].get("dialect_mapping", {}).items()
    )
    dialect_drop_down = gr.Dropdown(
        choices=default_dialects,
        value=default_dialects[0][1] if default_dialects else None,
        label="腔調",
        visible=bool(default_dialects),
    )

    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[dialect_drop_down],
    )

    gr.Markdown(
        """
        # 臺灣客語語音辨識系統
        ### Taiwanese Hakka Automatic-Speech-Recognition System

        ### 研發
        - **[李鴻欣 Hung-Shin Lee](mailto:hungshinlee@gmail.com)([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))**
        - **[陳力瑋 Li-Wei Chen](mailto:wayne900619@gmail.com)([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))**

        ### 合作單位
        - **[國立聯合大學智慧客家實驗室](https://www.gohakka.org)**
        """
    )

    # Cross-call recognizer state: (KaldiRecognizer, finalized segments).
    state = gr.State()
    audio = gr.Audio(
        label="錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )

    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            dialect_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="客語漢字"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
        # flagging_mode="auto",
    )

demo.launch()