frankenliu committed · Commit 1d9763f · verified · Parent: 44947e6

page ui update (#8)

- track json files (2afc9d8c207fcf936be3bfc4a891e241842e4973)
- add example json (c9f22e25aa2df3556219ef0a9274f5e7c5496f8f)
- reconstruct the demo page layout (4b383c25d0b3869fa9cd612d1ed8a3e42dd07177)

Files changed (3)
  1. .gitattributes +1 -0
  2. app.py +47 -56
  3. resources/examples.json +3 -0
.gitattributes CHANGED
@@ -37,4 +37,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.wav filter=lfs diff=lfs merge=lfs -text
 *.flac filter=lfs diff=lfs merge=lfs -text
 *.mp3 filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import shutil
+import json
 import time
 import gradio as gr
 
@@ -8,18 +9,6 @@ import torchaudio
 import soundfile as sf
 from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 
-# tasks demonstrated on the demo web
-tasks_dict = {
-    "Speaker Timbre Analysis": {"prompt": "Write an audio caption describing the speaker's timbre.",
-                                "audios": ["resources/speaker-timbre-analysis/example1.flac", "resources/speaker-timbre-analysis/example2.flac", "resources/speaker-timbre-analysis/example3.flac"]},
-    "Speaker Language Analysis": {"prompt": "请描述说话人的语言特性, 包括说话人的语种, 口音等.",
-                                  "audios": ["resources/speaker-language-analysis/example1.mp3", "resources/speaker-language-analysis/example2.mp3", "resources/speaker-language-analysis/example3.mp3"]},
-    "Environmental Sound Recognition (multi-label)": {"prompt": "Which labels describe the sound?",
-                                                      "audios": ["resources/environmental-sound-recogntion(multi-label)/example1.wav", "resources/environmental-sound-recogntion(multi-label)/example2.wav", "resources/environmental-sound-recogntion(multi-label)/example3.wav"]},
-    "Music Instrument Recognition (single-label)": {"prompt": "What's the music instrument?",
-                                                    "audios": ["resources/music-instrument-recognition(single-label)/example1.wav", "resources/music-instrument-recognition(single-label)/example2.wav", "resources/music-instrument-recognition(single-label)/example3.wav"]},
-}
-
 def _load_model(model_name: str = "mispeech/MiDashengLM-HF-dev"):
     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
     processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
@@ -27,6 +16,11 @@ def _load_model(model_name: str = "mispeech/MiDashengLM-HF-dev"):
     model.eval()
     return model, processor, tokenizer
 
+def _load_examples(file_path: str):
+    with open(file_path, mode='r', encoding="utf-8") as fp:
+        data_list = json.load(fp)
+    return data_list
+
 def _handle_request(text_input: str, audio_input: str, sample_rate: int = 16000):
     request_id = os.urandom(16).hex()
     out_dir = f"resources/usr/{request_id}"
@@ -77,9 +71,10 @@ def _prepare_text(text_input: str, audio_path: str):
     return message
 
 def infer(text_input, audio_input):
+    if not _check_inputs(text_input, audio_input):
+        raise gr.Error("Invalid inputs!")
     audio_path, text_path = _handle_request(text_input, audio_input)
 
-
     message = _prepare_text(text_input, audio_path)
     print(f"Input message is :\n{message}")
     start_time = time.perf_counter()
@@ -113,33 +108,23 @@ def _check_inputs(text_input, audio_input):
         return True
     return False
 
-def _task_dispatcher(choice, text_input, audio_input):
-    """
-    different tasks may need different ways of handling
-    """
-    print(f"Task is {choice}")
-    if not _check_inputs(text_input, audio_input):
-        raise gr.Error("Invalid inputs!")
-
-    response = infer(text_input, audio_input)
-
-    return response
-
-def _update_inputs(choice):
-    """
-    update the default prompt and example audio when task_dropdown changes
-    """
-    task_info = tasks_dict.get(choice, {})
-    prompt = task_info.get("prompt", "")
-    audios = task_info.get("audios", [])
-    default_audio = audios[0] if audios else None
-    return prompt, gr.Dropdown(choices=audios, value=default_audio), default_audio
-
 def _update_audio_input(choice):
     """
     update audio player when audio_dropdown changes
     """
-    return choice
+    prompts = audio_to_prompts.get(choice, [])
+    prompt = prompts[0] if prompts else ""
+    return (
+        gr.update(choices=prompts, value=prompt),
+        choice,
+        prompt,
+    )
+
+def _update_text_input(choice):
+    """
+    update input prompt when example_prompts_dropdown changes
+    """
+    return gr.update(value=choice)
 
 def _disable_button():
     return gr.update(value="Processing...", interactive=False)
@@ -149,41 +134,47 @@ def _enable_button():
 
 if __name__ == "__main__":
     model_name = "mispeech/MiDashengLM-HF-dev"
+    examples_path = "resources/examples.json"
     model, processor, tokenizer = _load_model(model_name)
+    data_list = _load_examples(examples_path)
+    audio_list = [item["audio"] for item in data_list]
+    audio_to_prompts = {item["audio"]: item["prompts"] for item in data_list}
 
     with gr.Blocks() as demo:
         gr.Markdown("# 🪄 Select an example or upload your own audio")
 
-        """Task selection"""
-        with gr.Row():
-            task_dropdown = gr.Dropdown(
-                choices=list(tasks_dict.keys()),
-                label="📋 Task",
-            )
        """Inputs fill in"""
         with gr.Row(equal_height=True):
             text_input = gr.Textbox(
                 label="✍️ Prompt",
-                value=tasks_dict[list(tasks_dict.keys())[0]]["prompt"],
-                lines=13,
-                max_lines=13)
+                value=data_list[0]["prompts"][0],
+                lines=17,
+                max_lines=17)
             """Audio examples selection"""
             with gr.Column():
                 audio_dropdown = gr.Dropdown(
-                    choices=tasks_dict[list(tasks_dict.keys())[0]]["audios"],
+                    choices=audio_list,
+                    value=audio_list[0],
                     label="🎶 Audio",
+                    interactive=True,
                 )
-                audio_input = gr.Audio(label="Audio Wave", value=tasks_dict[list(tasks_dict.keys())[0]]["audios"][0], type="filepath")
-
-        task_dropdown.change(
-            fn=_update_inputs,
-            inputs=task_dropdown,
-            outputs=[text_input, audio_dropdown, audio_input])
-
+                example_prompts_dropdown = gr.Dropdown(
+                    choices=audio_to_prompts[audio_list[0]],
+                    value=audio_to_prompts[audio_list[0]][0],
+                    label="💬 Example prompt",
+                    interactive=True,
+                )
+                audio_input = gr.Audio(label="Audio Wave", value=audio_list[0], type="filepath")
+
         audio_dropdown.change(
            fn=_update_audio_input,
            inputs=audio_dropdown,
-            outputs=audio_input,
+            outputs=[example_prompts_dropdown, audio_input, text_input],
+        )
+        example_prompts_dropdown.change(
+            fn=_update_text_input,
+            inputs=example_prompts_dropdown,
+            outputs=text_input,
        )
 
        submit_button = gr.Button(value="Submit", variant="primary")
@@ -195,8 +186,8 @@ if __name__ == "__main__":
             inputs=None,
             outputs=submit_button
         ).then(
-            fn=_task_dispatcher,
-            inputs=[task_dropdown, text_input, audio_input],
+            fn=infer,
+            inputs=[text_input, audio_input],
             outputs=text_output
         ).then(
             fn=_enable_button,
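
The heart of the reconstructed layout is the dropdown wiring: choosing an audio example refreshes the example-prompt dropdown, the audio player, and the prompt textbox, and choosing an example prompt fills the textbox. Below is a minimal standalone sketch of that pattern, not code from this repo: it assumes a recent Gradio release, and the file names and prompts are placeholders rather than the real resources/examples.json contents.

import gradio as gr

# Placeholder example data (illustrative only; the demo builds this mapping
# from resources/examples.json via _load_examples).
audio_to_prompts = {
    "example1.wav": ["Write an audio caption describing the speaker's timbre."],
    "example2.wav": ["Which labels describe the sound?", "What's the music instrument?"],
}
audio_list = list(audio_to_prompts)

def on_audio_change(audio):
    # Repopulate the prompt dropdown, swap the audio player source,
    # and reset the prompt textbox to the first example prompt.
    prompts = audio_to_prompts.get(audio, [])
    prompt = prompts[0] if prompts else ""
    return gr.update(choices=prompts, value=prompt), audio, prompt

with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Prompt", value=audio_to_prompts[audio_list[0]][0])
    audio_dropdown = gr.Dropdown(choices=audio_list, value=audio_list[0], label="Audio", interactive=True)
    prompt_dropdown = gr.Dropdown(choices=audio_to_prompts[audio_list[0]], label="Example prompt", interactive=True)
    # The real app also sets value=audio_list[0]; omitted here because the
    # placeholder files do not exist on disk.
    audio_player = gr.Audio(label="Audio Wave", type="filepath")

    audio_dropdown.change(on_audio_change, audio_dropdown, [prompt_dropdown, audio_player, text_input])
    prompt_dropdown.change(lambda p: gr.update(value=p), prompt_dropdown, text_input)

demo.launch()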
resources/examples.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f4e9e74d2633029f7bdb308a0fc7e43b4c90aaabc82bd572537675b5593fd19
+size 3000
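
Because *.json is now tracked by Git LFS (see the .gitattributes change above), only the pointer for resources/examples.json appears in this diff. Judging from how app.py consumes it (json.load returning a list of items read via item["audio"] and item["prompts"]), the file presumably has the structure sketched below; the specific paths and prompt strings are illustrative guesses, not the actual 3,000-byte file.

import json
import os

# Hypothetical contents for resources/examples.json, matching the fields
# app.py reads: each entry has an "audio" path and a list of "prompts".
examples = [
    {
        "audio": "resources/speaker-timbre-analysis/example1.flac",
        "prompts": ["Write an audio caption describing the speaker's timbre."],
    },
    {
        "audio": "resources/music-instrument-recognition(single-label)/example1.wav",
        "prompts": ["What's the music instrument?"],
    },
]

os.makedirs("resources", exist_ok=True)
with open("resources/examples.json", "w", encoding="utf-8") as fp:
    json.dump(examples, fp, ensure_ascii=False, indent=2)

# _load_examples() reads this back with json.load(), and the demo builds
# audio_list and audio_to_prompts from the "audio" and "prompts" fields.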