page ui update (#8)

- track json files (2afc9d8c207fcf936be3bfc4a891e241842e4973)
- add example json (c9f22e25aa2df3556219ef0a9274f5e7c5496f8f)
- reconstruct the demo page layout (4b383c25d0b3869fa9cd612d1ed8a3e42dd07177)

Files changed:
- .gitattributes +1 -0
- app.py +47 -56
- resources/examples.json +3 -0
.gitattributes CHANGED

@@ -37,4 +37,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.wav filter=lfs diff=lfs merge=lfs -text
 *.flac filter=lfs diff=lfs merge=lfs -text
 *.mp3 filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
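The new *.json rule is exactly what the Git LFS CLI writes; rather than editing .gitattributes by hand, the same line can be appended with (assuming git-lfs is installed):

    git lfs track "*.json"

Once a pattern is tracked, Git commits only a small pointer file and keeps the real bytes in LFS storage, which is why the resources/examples.json diff at the bottom of this page shows a pointer rather than JSON content.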
app.py CHANGED

@@ -1,5 +1,6 @@
 import os
 import shutil
+import json
 import time
 import gradio as gr
 
@@ -8,18 +9,6 @@ import torchaudio
 import soundfile as sf
 from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 
-# tasks demonstrated on the demo web
-tasks_dict = {
-    "Speaker Timbre Analysis": {"prompt": "Write an audio caption describing the speaker's timbre.",
-                                "audios": ["resources/speaker-timbre-analysis/example1.flac", "resources/speaker-timbre-analysis/example2.flac", "resources/speaker-timbre-analysis/example3.flac"]},
-    "Speaker Language Analysis": {"prompt": "请描述说话人的语言特性, 包括说话人的语种, 口音等.",
-                                  "audios": ["resources/speaker-language-analysis/example1.mp3", "resources/speaker-language-analysis/example2.mp3", "resources/speaker-language-analysis/example3.mp3"]},
-    "Environmental Sound Recognition (multi-label)": {"prompt": "Which labels describe the sound?",
-                                                      "audios": ["resources/environmental-sound-recogntion(multi-label)/example1.wav", "resources/environmental-sound-recogntion(multi-label)/example2.wav", "resources/environmental-sound-recogntion(multi-label)/example3.wav"]},
-    "Music Instrument Recognition (single-label)": {"prompt": "What's the music instrument?",
-                                                    "audios": ["resources/music-instrument-recognition(single-label)/example1.wav", "resources/music-instrument-recognition(single-label)/example2.wav", "resources/music-instrument-recognition(single-label)/example3.wav"]},
-}
-
 def _load_model(model_name: str = "mispeech/MiDashengLM-HF-dev"):
     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
     processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
@@ -27,6 +16,11 @@ def _load_model(model_name: str = "mispeech/MiDashengLM-HF-dev"):
     model.eval()
     return model, processor, tokenizer
 
+def _load_examples(file_path: str):
+    with open(file_path, mode='r', encoding="utf-8") as fp:
+        data_list = json.load(fp)
+    return data_list
+
 def _handle_request(text_input: str, audio_input: str, sample_rate: int = 16000):
     request_id = os.urandom(16).hex()
     out_dir = f"resources/usr/{request_id}"
@@ -77,9 +71,10 @@ def _prepare_text(text_input: str, audio_path: str):
     return message
 
 def infer(text_input, audio_input):
+    if not _check_inputs(text_input, audio_input):
+        raise gr.Error("Invalid inputs!")
     audio_path, text_path = _handle_request(text_input, audio_input)
 
-
     message = _prepare_text(text_input, audio_path)
     print(f"Input message is :\n{message}")
     start_time = time.perf_counter()
@@ -113,33 +108,23 @@ def _check_inputs(text_input, audio_input):
         return True
     return False
 
-def _task_dispatcher(choice, text_input, audio_input):
-    """
-    different tasks may need different ways of handling
-    """
-    print(f"Task is {choice}")
-    if not _check_inputs(text_input, audio_input):
-        raise gr.Error("Invalid inputs!")
-
-    response = infer(text_input, audio_input)
-
-    return response
-
-def _update_inputs(choice):
-    """
-    update the default prompt and example audio when task_dropdown changes
-    """
-    task_info = tasks_dict.get(choice, {})
-    prompt = task_info.get("prompt", "")
-    audios = task_info.get("audios", [])
-    default_audio = audios[0] if audios else None
-    return prompt, gr.Dropdown(choices=audios, value=default_audio), default_audio
-
 def _update_audio_input(choice):
     """
     update audio player when audio_dropdown changes
     """
-
+    prompts = audio_to_prompts.get(choice, [])
+    prompt = prompts[0] if prompts else ""
+    return (
+        gr.update(choices=prompts, value=prompt),
+        choice,
+        prompt,
+    )
+
+def _update_text_input(choice):
+    """
+    update input prompt when example_prompts_dropdown changes
+    """
+    return gr.update(value=choice)
 
 def _disable_button():
     return gr.update(value="Processing...", interactive=False)
@@ -149,41 +134,47 @@ def _enable_button():
 
 if __name__ == "__main__":
     model_name = "mispeech/MiDashengLM-HF-dev"
+    examples_path = "resources/examples.json"
     model, processor, tokenizer = _load_model(model_name)
+    data_list = _load_examples(examples_path)
+    audio_list = [item["audio"] for item in data_list]
+    audio_to_prompts = {item["audio"]: item["prompts"] for item in data_list}
 
     with gr.Blocks() as demo:
         gr.Markdown("# 🪄 Select an example or upload your own audio")
 
-        """Task selection"""
-        with gr.Row():
-            task_dropdown = gr.Dropdown(
-                choices=list(tasks_dict.keys()),
-                label="📋 Task",
-            )
         """Inputs fill in"""
         with gr.Row(equal_height=True):
            text_input = gr.Textbox(
                 label="✍️ Prompt",
-                value=
-                lines=
-                max_lines=
+                value=data_list[0]["prompts"][0],
+                lines=17,
+                max_lines=17)
         """Audio examples selection"""
         with gr.Column():
             audio_dropdown = gr.Dropdown(
-                choices=
+                choices=audio_list,
+                value=audio_list[0],
                 label="🎶 Audio",
+                interactive=True,
             )
-
-
-
-
-
-
-
+            example_prompts_dropdown = gr.Dropdown(
+                choices=audio_to_prompts[audio_list[0]],
+                value=audio_to_prompts[audio_list[0]][0],
+                label="💬 Example prompt",
+                interactive=True,
+            )
+            audio_input = gr.Audio(label="Audio Wave", value=audio_list[0], type="filepath")
+
         audio_dropdown.change(
             fn=_update_audio_input,
             inputs=audio_dropdown,
-            outputs=audio_input,
+            outputs=[example_prompts_dropdown, audio_input, text_input],
+        )
+        example_prompts_dropdown.change(
+            fn=_update_text_input,
+            inputs=example_prompts_dropdown,
+            outputs=text_input,
        )
 
         submit_button = gr.Button(value="Submit", variant="primary")
@@ -195,8 +186,8 @@ if __name__ == "__main__":
             inputs=None,
             outputs=submit_button
         ).then(
-            fn=
-            inputs=[
+            fn=infer,
+            inputs=[text_input, audio_input],
             outputs=text_output
         ).then(
             fn=_enable_button,
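The net effect of the app.py changes: the task dropdown and its hard-coded tasks_dict are gone, replaced by data loaded from resources/examples.json; selecting an audio file now repopulates the example-prompt dropdown, moves the audio player to the chosen file, and pre-fills the prompt textbox, while selecting an example prompt only overwrites the textbox. Below is a minimal, self-contained sketch of that wiring, with hypothetical inline data standing in for examples.json (the file paths are illustrative and would need to exist for playback to work):

import gradio as gr

# Hypothetical stand-in for the data loaded from resources/examples.json.
audio_to_prompts = {
    "resources/example1.wav": ["What's the music instrument?"],
    "resources/example2.flac": ["Write an audio caption describing the speaker's timbre."],
}
audio_list = list(audio_to_prompts)

def on_audio_change(choice):
    # Repopulate the prompt dropdown, move the player, and pre-fill the textbox.
    prompts = audio_to_prompts.get(choice, [])
    prompt = prompts[0] if prompts else ""
    return gr.update(choices=prompts, value=prompt), choice, prompt

with gr.Blocks() as demo:
    audio_dropdown = gr.Dropdown(choices=audio_list, value=audio_list[0], label="🎶 Audio")
    prompt_dropdown = gr.Dropdown(choices=audio_to_prompts[audio_list[0]],
                                  value=audio_to_prompts[audio_list[0]][0],
                                  label="💬 Example prompt")
    text_input = gr.Textbox(label="✍️ Prompt", value=audio_to_prompts[audio_list[0]][0])
    audio_player = gr.Audio(value=audio_list[0], type="filepath", label="Audio Wave")

    audio_dropdown.change(fn=on_audio_change, inputs=audio_dropdown,
                          outputs=[prompt_dropdown, audio_player, text_input])
    # Selecting an example prompt only overwrites the textbox.
    prompt_dropdown.change(fn=lambda p: gr.update(value=p),
                           inputs=prompt_dropdown, outputs=text_input)

demo.launch()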
resources/examples.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f4e9e74d2633029f7bdb308a0fc7e43b4c90aaabc82bd572537675b5593fd19
+size 3000
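Because *.json is now LFS-tracked, the committed file is only this pointer (spec version, content hash, byte size); the actual JSON lives in LFS storage and is not visible in the diff. From how app.py consumes it — each entry exposes item["audio"] and item["prompts"], and the first entry's first prompt seeds the textbox — the file is presumably a list shaped like the following sketch (entries are illustrative, reusing paths from the removed tasks_dict, not the real contents):

[
  {
    "audio": "resources/speaker-timbre-analysis/example1.flac",
    "prompts": ["Write an audio caption describing the speaker's timbre."]
  },
  {
    "audio": "resources/music-instrument-recognition(single-label)/example1.wav",
    "prompts": ["What's the music instrument?"]
  }
]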