# Hugging Face file-viewer header (page residue, not part of the program):
#   txya900619's picture
#   feat: add app.py
#   a3effe4
#   raw / history blame
#   4.04 kB
import json
import gradio as gr
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model
def load_vosk(model_id: str):
    """Download a model snapshot and wrap it in a Vosk ``Model``.

    Args:
        model_id: Repository identifier handed to ``snapshot_download``.

    Returns:
        A ``vosk.Model`` constructed from the local directory that
        ``snapshot_download`` returns.
    """
    return Model(model_path=snapshot_download(model_id))
# Expose `load_vosk` to OmegaConf under the resolver name "load_vosk". This
# must run before the config is loaded/converted below.
OmegaConf.register_new_resolver("load_vosk", load_vosk)
# Load the model registry and convert it to plain Python objects.
# NOTE(review): presumably configs/models.yaml uses `${load_vosk:...}`
# interpolations so each "model" entry becomes a loaded Vosk model here —
# confirm against the YAML file.
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
def automatic_speech_recognition(
model_id: str, dialect_id: str, stream: str, new_chunk: str
):
if isinstance(models_config[model_id]["model"], dict):
model = models_config[model_id]["model"][dialect_id]
else:
model = models_config[model_id]["model"]
sample_rate, audio_array = new_chunk
if audio_array.ndim == 2:
audio_array = audio_array[:, 0]
audio_bytes = audio_array.tobytes()
if stream is None:
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)
result = []
else:
rec, result = stream
if rec.AcceptWaveform(audio_bytes):
text_result = json.loads(rec.Result())["text"]
if text_result != "":
result.append(text_result)
partial_result = ""
else:
partial_result = json.loads(rec.PartialResult())["partial"] + " "
if len(result) > 0:
output_text = ",".join(result) + "," + partial_result
else:
output_text = partial_result
return (rec, result), output_text
def when_model_selected(model_id: str):
    """Build the dialect-dropdown update for the newly selected model.

    Args:
        model_id: Key into ``models_config`` for the chosen model.

    Returns:
        A ``gr.update`` that hides the dropdown when the model entry has no
        ``"dialect_mapping"``; otherwise one that shows it, filled with the
        mapping's ``(key, value)`` pairs and preselecting the first value.
    """
    selected = models_config[model_id]
    if "dialect_mapping" in selected:
        dialect_choices = list(selected["dialect_mapping"].items())
        first_value = dialect_choices[0][1]
        return gr.update(
            choices=dialect_choices,
            value=first_value,
            visible=True,
        )
    return gr.update(visible=False)
# Top-level Blocks container for the app: sets the page title, imports an
# external stylesheet via CSS, and uses the default theme with a custom font
# stack ("tauhu-oo" first, then a Google font and generic fallbacks).
demo = gr.Blocks(
title="臺灣客語語音辨識系統",
# The stylesheet is pulled in with a CSS @import from tauhu.tw.
css="@import url(https://tauhu.tw/tauhu-oo.css);",
theme=gr.themes.Default(
# Fonts are listed in order of preference.
font=(
"tauhu-oo",
gr.themes.GoogleFont("Source Sans Pro"),
"ui-sans-serif",
"system-ui",
"sans-serif",
)
),
)
# Lay out the UI inside the Blocks container: model/dialect selectors, the
# credits text, and the live streaming recognition interface.
with demo:
    # The first configured model is preselected.
    default_model_id = list(models_config)[0]
    model_drop_down = gr.Dropdown(
        list(models_config),
        value=default_model_id,
        label="模型",
    )

    # A model entry may lack "dialect_mapping" (when_model_selected hides the
    # dropdown for such models), so the default model gets the same treatment
    # instead of failing with KeyError at startup.
    default_dialect_choices = list(
        models_config[default_model_id].get("dialect_mapping", {}).items()
    )
    dialect_drop_down = gr.Dropdown(
        choices=default_dialect_choices,
        value=default_dialect_choices[0][1] if default_dialect_choices else None,
        label="腔調",
        visible=bool(default_dialect_choices),
    )

    # Refresh the dialect choices whenever the user picks another model.
    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[dialect_drop_down],
    )

    # NOTE(review): the mailto targets below read "[email protected]", which
    # looks like an email-obfuscation placeholder — confirm the real addresses.
    gr.Markdown(
        """
# 臺灣客語語音辨識系統
### Taiwanese Hakka Automatic-Speech-Recognition System
### 研發
- **[李鴻欣 Hung-Shin Lee](mailto:[email protected])([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))**
- **[陳力瑋 Li-Wei Chen](mailto:[email protected])([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))**
### 合作單位
- **[國立聯合大學智慧客家實驗室](https://www.gohakka.org)**
"""
    )

    # Carries the (recognizer, results) tuple between streamed chunks.
    state = gr.State()
    audio = gr.Audio(
        label="錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )

    # Live interface: the recognition callback receives the selected model and
    # dialect, the carried state, and the audio, and returns the new state
    # plus the transcript text.
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            dialect_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="客語漢字"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
        # flagging_mode="auto",
    )
# Launch the Gradio app defined above; runs at module load with default settings.
demo.launch()