page ui update (#8)

- track json files (2afc9d8c207fcf936be3bfc4a891e241842e4973)
- add example json (c9f22e25aa2df3556219ef0a9274f5e7c5496f8f)
- reconstruct the demo page layout (4b383c25d0b3869fa9cd612d1ed8a3e42dd07177)

Files changed:
- .gitattributes +1 -0
- app.py +47 -56
- resources/examples.json +3 -0
.gitattributes CHANGED

@@ -37,4 +37,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.wav filter=lfs diff=lfs merge=lfs -text
 *.flac filter=lfs diff=lfs merge=lfs -text
 *.mp3 filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
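The new *.json rule is exactly what the Git LFS CLI writes; rather than editing .gitattributes by hand, the same line can be appended with (assuming git-lfs is installed):

    git lfs track "*.json"

Once a pattern is tracked, Git commits only a small pointer file and keeps the real bytes in LFS storage, which is why the resources/examples.json diff at the bottom of this page shows a pointer rather than JSON content.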
app.py CHANGED

@@ -1,5 +1,6 @@
 import os
 import shutil
+import json
 import time
 import gradio as gr
 
@@ -8,18 +9,6 @@ import torchaudio
 import soundfile as sf
 from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 
-# tasks demonstrated on the demo web
-tasks_dict = {
-    "Speaker Timbre Analysis": {"prompt": "Write an audio caption describing the speaker's timbre.",
-                                "audios": ["resources/speaker-timbre-analysis/example1.flac", "resources/speaker-timbre-analysis/example2.flac", "resources/speaker-timbre-analysis/example3.flac"]},
-    "Speaker Language Analysis": {"prompt": "请描述说话人的语言特性, 包括说话人的语种, 口音等.",
-                                  "audios": ["resources/speaker-language-analysis/example1.mp3", "resources/speaker-language-analysis/example2.mp3", "resources/speaker-language-analysis/example3.mp3"]},
-    "Environmental Sound Recognition (multi-label)": {"prompt": "Which labels describe the sound?",
-                                                      "audios": ["resources/environmental-sound-recogntion(multi-label)/example1.wav", "resources/environmental-sound-recogntion(multi-label)/example2.wav", "resources/environmental-sound-recogntion(multi-label)/example3.wav"]},
-    "Music Instrument Recognition (single-label)": {"prompt": "What's the music instrument?",
-                                                    "audios": ["resources/music-instrument-recognition(single-label)/example1.wav", "resources/music-instrument-recognition(single-label)/example2.wav", "resources/music-instrument-recognition(single-label)/example3.wav"]},
-}
-
 def _load_model(model_name: str = "mispeech/MiDashengLM-HF-dev"):
     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
     processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
@@ -27,6 +16,11 @@ def _load_model(model_name: str = "mispeech/MiDashengLM-HF-dev"):
     model.eval()
     return model, processor, tokenizer
 
+def _load_examples(file_path: str):
+    with open(file_path, mode='r', encoding="utf-8") as fp:
+        data_list = json.load(fp)
+    return data_list
+
 def _handle_request(text_input: str, audio_input: str, sample_rate: int = 16000):
     request_id = os.urandom(16).hex()
     out_dir = f"resources/usr/{request_id}"
@@ -77,9 +71,10 @@ def _prepare_text(text_input: str, audio_path: str):
     return message
 
 def infer(text_input, audio_input):
+    if not _check_inputs(text_input, audio_input):
+        raise gr.Error("Invalid inputs!")
     audio_path, text_path = _handle_request(text_input, audio_input)
 
-
     message = _prepare_text(text_input, audio_path)
     print(f"Input message is :\n{message}")
     start_time = time.perf_counter()
@@ -113,33 +108,23 @@ def _check_inputs(text_input, audio_input):
         return True
     return False
 
-def _task_dispatcher(choice, text_input, audio_input):
-    """
-    different tasks may need different ways of handling
-    """
-    print(f"Task is {choice}")
-    if not _check_inputs(text_input, audio_input):
-        raise gr.Error("Invalid inputs!")
-
-    response = infer(text_input, audio_input)
-
-    return response
-
-def _update_inputs(choice):
-    """
-    update the default prompt and example audio when task_dropdown changes
-    """
-    task_info = tasks_dict.get(choice, {})
-    prompt = task_info.get("prompt", "")
-    audios = task_info.get("audios", [])
-    default_audio = audios[0] if audios else None
-    return prompt, gr.Dropdown(choices=audios, value=default_audio), default_audio
-
 def _update_audio_input(choice):
     """
     update audio player when audio_dropdown changes
     """
-
+    prompts = audio_to_prompts.get(choice, [])
+    prompt = prompts[0] if prompts else ""
+    return (
+        gr.update(choices=prompts, value=prompt),
+        choice,
+        prompt,
+    )
+
+def _update_text_input(choice):
+    """
+    update input prompt when example_prompts_dropdown changes
+    """
+    return gr.update(value=choice)
 
 def _disable_button():
     return gr.update(value="Processing...", interactive=False)
@@ -149,41 +134,47 @@ def _enable_button():
 
 if __name__ == "__main__":
     model_name = "mispeech/MiDashengLM-HF-dev"
+    examples_path = "resources/examples.json"
     model, processor, tokenizer = _load_model(model_name)
+    data_list = _load_examples(examples_path)
+    audio_list = [item["audio"] for item in data_list]
+    audio_to_prompts = {item["audio"]: item["prompts"] for item in data_list}
 
     with gr.Blocks() as demo:
         gr.Markdown("# 🪄 Select an example or upload your own audio")
 
-        """Task selection"""
-        with gr.Row():
-            task_dropdown = gr.Dropdown(
-                choices=list(tasks_dict.keys()),
-                label="📋 Task",
-            )
         """Inputs fill in"""
         with gr.Row(equal_height=True):
            text_input = gr.Textbox(
                 label="✍️ Prompt",
-                value=
-                lines=
-                max_lines=
+                value=data_list[0]["prompts"][0],
+                lines=17,
+                max_lines=17)
         """Audio examples selection"""
         with gr.Column():
             audio_dropdown = gr.Dropdown(
-                choices=
+                choices=audio_list,
+                value=audio_list[0],
                 label="🎶 Audio",
+                interactive=True,
             )
-
-
-
-
-
-
-
+            example_prompts_dropdown = gr.Dropdown(
+                choices=audio_to_prompts[audio_list[0]],
+                value=audio_to_prompts[audio_list[0]][0],
+                label="💬 Example prompt",
+                interactive=True,
+            )
+            audio_input = gr.Audio(label="Audio Wave", value=audio_list[0], type="filepath")
+
         audio_dropdown.change(
             fn=_update_audio_input,
             inputs=audio_dropdown,
-            outputs=audio_input,
+            outputs=[example_prompts_dropdown, audio_input, text_input],
+        )
+        example_prompts_dropdown.change(
+            fn=_update_text_input,
+            inputs=example_prompts_dropdown,
+            outputs=text_input,
        )
 
         submit_button = gr.Button(value="Submit", variant="primary")
@@ -195,8 +186,8 @@ if __name__ == "__main__":
             inputs=None,
             outputs=submit_button
         ).then(
-            fn=
-            inputs=[
+            fn=infer,
+            inputs=[text_input, audio_input],
             outputs=text_output
         ).then(
             fn=_enable_button,
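The net effect of the app.py changes: the task dropdown and its hard-coded tasks_dict are gone, replaced by data loaded from resources/examples.json; selecting an audio file now repopulates the example-prompt dropdown, moves the audio player to the chosen file, and pre-fills the prompt textbox, while selecting an example prompt only overwrites the textbox. Below is a minimal, self-contained sketch of that wiring, with hypothetical inline data standing in for examples.json (the file paths are illustrative and would need to exist for playback to work):

import gradio as gr

# Hypothetical stand-in for the data loaded from resources/examples.json.
audio_to_prompts = {
    "resources/example1.wav": ["What's the music instrument?"],
    "resources/example2.flac": ["Write an audio caption describing the speaker's timbre."],
}
audio_list = list(audio_to_prompts)

def on_audio_change(choice):
    # Repopulate the prompt dropdown, move the player, and pre-fill the textbox.
    prompts = audio_to_prompts.get(choice, [])
    prompt = prompts[0] if prompts else ""
    return gr.update(choices=prompts, value=prompt), choice, prompt

with gr.Blocks() as demo:
    audio_dropdown = gr.Dropdown(choices=audio_list, value=audio_list[0], label="🎶 Audio")
    prompt_dropdown = gr.Dropdown(choices=audio_to_prompts[audio_list[0]],
                                  value=audio_to_prompts[audio_list[0]][0],
                                  label="💬 Example prompt")
    text_input = gr.Textbox(label="✍️ Prompt", value=audio_to_prompts[audio_list[0]][0])
    audio_player = gr.Audio(value=audio_list[0], type="filepath", label="Audio Wave")

    audio_dropdown.change(fn=on_audio_change, inputs=audio_dropdown,
                          outputs=[prompt_dropdown, audio_player, text_input])
    # Selecting an example prompt only overwrites the textbox.
    prompt_dropdown.change(fn=lambda p: gr.update(value=p),
                           inputs=prompt_dropdown, outputs=text_input)

demo.launch()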
resources/examples.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f4e9e74d2633029f7bdb308a0fc7e43b4c90aaabc82bd572537675b5593fd19
+size 3000
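Because *.json is now LFS-tracked, the committed file is only this pointer (spec version, content hash, byte size); the actual JSON lives in LFS storage and is not visible in the diff. From how app.py consumes it — each entry exposes item["audio"] and item["prompts"], and the first entry's first prompt seeds the textbox — the file is presumably a list shaped like the following sketch (entries are illustrative, reusing paths from the removed tasks_dict, not the real contents):

[
  {
    "audio": "resources/speaker-timbre-analysis/example1.flac",
    "prompts": ["Write an audio caption describing the speaker's timbre."]
  },
  {
    "audio": "resources/music-instrument-recognition(single-label)/example1.wav",
    "prompts": ["What's the music instrument?"]
  }
]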