update
Browse files
app.py
CHANGED
|
@@ -111,6 +111,7 @@ def generate_audio_gradio(
|
|
| 111 |
cfg_strength,
|
| 112 |
num_steps,
|
| 113 |
variant,
|
|
|
|
| 114 |
):
|
| 115 |
|
| 116 |
if duration <= 0 or num_steps <= 0:
|
|
@@ -146,8 +147,7 @@ def generate_audio_gradio(
|
|
| 146 |
sampler_arg_name = "fm"
|
| 147 |
|
| 148 |
rng = torch.Generator(device=device)
|
| 149 |
-
|
| 150 |
-
rng.manual_seed(42)
|
| 151 |
|
| 152 |
audios = generation_func(
|
| 153 |
[prompt]*NUM_SAMPLE,
|
|
@@ -167,7 +167,7 @@ def generate_audio_gradio(
|
|
| 167 |
|
| 168 |
for i, audio in enumerate(audios):
|
| 169 |
audio = audio.float().cpu()
|
| 170 |
-
audio = fade_out(audio, seq_cfg.sampling_rate)
|
| 171 |
|
| 172 |
current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
| 173 |
filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
|
|
@@ -188,7 +188,7 @@ output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
|
| 188 |
denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
|
| 189 |
cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
|
| 190 |
duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
|
| 191 |
-
|
| 192 |
variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
|
| 193 |
|
| 194 |
|
|
@@ -214,27 +214,26 @@ description_text = """
|
|
| 214 |
|
| 215 |
gr_interface = gr.Interface(
|
| 216 |
fn=generate_audio_gradio,
|
| 217 |
-
inputs=[input_text, duration, cfg_strength, denoising_steps, variant],
|
| 218 |
outputs=[
|
| 219 |
gr.Audio(label="🎵 Audio Sample", type="filepath"),
|
| 220 |
gr.Textbox(label="Prompt Used", interactive=False)
|
| 221 |
],
|
| 222 |
title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
|
| 223 |
-
description=
|
| 224 |
flagging_mode="never",
|
| 225 |
examples=[
|
| 226 |
-
["
|
| 227 |
["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full"],
|
| 228 |
-
["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full"],
|
| 229 |
-
["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full"],
|
| 230 |
-
["
|
| 231 |
-
["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full"],
|
| 232 |
-
["
|
| 233 |
-
["A
|
| 234 |
-
["
|
| 235 |
-
["
|
| 236 |
-
[
|
| 237 |
-
["A fork scrapes a plate, water drips slowly into a sink, and the faint hum of a refrigerator lingers in the background", 10, 3, 1, "meanaudio_s_full"]
|
| 238 |
],
|
| 239 |
cache_examples="lazy",
|
| 240 |
)
|
|
|
|
| 111 |
cfg_strength,
|
| 112 |
num_steps,
|
| 113 |
variant,
|
| 114 |
+
seed
|
| 115 |
):
|
| 116 |
|
| 117 |
if duration <= 0 or num_steps <= 0:
|
|
|
|
| 147 |
sampler_arg_name = "fm"
|
| 148 |
|
| 149 |
rng = torch.Generator(device=device)
|
| 150 |
+
rng.manual_seed(seed)
|
|
|
|
| 151 |
|
| 152 |
audios = generation_func(
|
| 153 |
[prompt]*NUM_SAMPLE,
|
|
|
|
| 167 |
|
| 168 |
for i, audio in enumerate(audios):
|
| 169 |
audio = audio.float().cpu()
|
| 170 |
+
audio = fade_out(audio, seq_cfg.sampling_rate, fade_ms=100)
|
| 171 |
|
| 172 |
current_time_string = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
| 173 |
filename = f"{safe_prompt}_{current_time_string}_{i}.flac"
|
|
|
|
| 188 |
denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
|
| 189 |
cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
|
| 190 |
duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
|
| 191 |
+
seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
|
| 192 |
variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
|
| 193 |
|
| 194 |
|
|
|
|
| 214 |
|
| 215 |
# Build the Gradio UI around generate_audio_gradio.
#
# Each row of `examples` must supply exactly one value per component listed in
# `inputs`, in the same order:
#   prompt text, Duration, Guidance Scale, Sampling Steps, Model Variant, Seed.
# A row that is shorter than `inputs` leaves the trailing component (the seed)
# without a value, so every row below carries an explicit seed.
gr_interface = gr.Interface(
    fn=generate_audio_gradio,
    inputs=[input_text, duration, cfg_strength, denoising_steps, variant, seed],
    outputs=[
        gr.Audio(label="🎵 Audio Sample", type="filepath"),
        gr.Textbox(label="Prompt Used", interactive=False)
    ],
    title="MeanAudio: Fast and Faithful Text-to-Audio Generation with Mean Flows",
    description=description_text,
    flagging_mode="never",
    examples=[
        ["Guitar and piano playing a warm music, with a soft and gentle melody, perfect for a romantic evening.", 10, 3, 1, "meanaudio_s_full", 42],
        # This row previously had no seed value, unlike every other row.
        ["Melodic human whistling harmonizing with natural birdsong", 10, 3, 1, "meanaudio_s_full", 42],
        ["A parade marches through a town square, with drumbeats pounding, children clapping, and a horse neighing amidst the commotion", 10, 3, 1, "meanaudio_s_full", 42],
        ["Quiet speech and then and airplane flying away", 10, 3, 1, "meanaudio_s_full", 42],
        ["The sound of a steam engine.", 10, 3, 1, "meanaudio_s_full", 42],
        ["A basketball bounces rhythmically on a court, shoes squeak against the floor, and a referee’s whistle cuts through the air", 10, 3, 1, "meanaudio_s_full", 42],
        ["Chopping meat on a wooden table.", 10, 3, 1, "meanaudio_s_full", 42],
        ["A vehicle engine revving then accelerating at a high rate as a metal surface is whipped followed by tires skidding.", 10, 3, 1, "meanaudio_s_full", 42],
        ["Battlefield scene, continuous roar of artillery and gunfire, high fidelity, the sharp crack of bullets, the thundering explosions of bombs, and the screams of wounded soldiers.", 10, 3, 1, "meanaudio_s_full", 42],
        ["Pop music that upbeat, catchy, and easy to listen, high fidelity, with simple melodies, electronic instruments and polished production.", 10, 3, 1, "meanaudio_s_full", 42],
        ["A fast-paced instrumental piece with a classical vibe featuring stringed instruments, evoking an energetic and uplifting mood.", 10, 3, 1, "meanaudio_s_full", 42]
    ],
    # NOTE(review): "lazy" is expected to defer example caching until an
    # example is first used — confirm against the installed Gradio version.
    cache_examples="lazy",
)
|