|
import gradio as gr |
|
import numpy as np |
|
from audioldm import text_to_audio, build_model |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the AudioLDM model once at startup and reuse it for every request.
# NOTE(review): presumably this loads pretrained checkpoint weights (and may
# download them on first run) — confirm against audioldm.build_model.
audioldm = build_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
    """Generate audio clip(s) from a text prompt with the global AudioLDM model.

    Args:
        text: Natural-language description of the desired sound.
        duration: Length of the generated clip in seconds.
        guidance_scale: Classifier-free guidance weight (larger follows the
            text more closely, smaller gives more diversity).
        random_seed: Seed for reproducible generation. Cast to int because
            Gradio's Number component delivers floats, and integer seeds are
            required for seeding.
        n_candidates: Number of candidates generated per prompt for automatic
            quality selection; cast to int for the same reason.

    Returns:
        A single ``(sample_rate, waveform)`` tuple when exactly one clip is
        produced, otherwise a list of such tuples — the shape Gradio's Audio
        component expects. Sample rate is fixed at 16 kHz.
    """
    waveform = text_to_audio(
        audioldm,
        text,
        int(random_seed),  # gr.Number yields float; seeding requires an int
        duration=duration,
        guidance_scale=guidance_scale,
        n_candidate_gen_per_text=int(n_candidates),
    )
    # Pair each clip with its sample rate for gr.Audio(type="numpy").
    # NOTE(review): wave[0] presumably drops a leading channel axis, i.e.
    # each wave is (1, n_samples) — confirm against text_to_audio's output.
    waveform = [(16000, wave[0]) for wave in waveform]
    if len(waveform) == 1:
        return waveform[0]
    return waveform
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
# Builds the demo page: header, prompt box, advanced settings, output audio,
# and a submit button wired to text2audio. Label typos fixed ("relavancy",
# subject/verb agreement); layout and wiring otherwise unchanged.
iface = gr.Blocks()

with iface:
    # Page header with title and paper/project links.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px;">
              Text-to-Audio Generation with AudioLDM
            </h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project page]</a>
          </p>
        </div>
        """
    )
    with gr.Group():
        with gr.Box():
            # Main text prompt for generation.
            textbox = gr.Textbox(value="A hammer is hitting a wooden surface", max_lines=1)

            # Advanced generation parameters, collapsed by default.
            with gr.Accordion("Click to modify detailed configurations", open=False):
                seed = gr.Number(value=42, label="Changing this value (any integer number) leads to a different generation result.")
                duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
                guidance_scale = gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relevance to text; Small => better diversity)")
                n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number controls the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually leads to better quality with heavier computation")

            # Generated clip is returned as a (sample_rate, ndarray) tuple.
            outputs = [gr.Audio(label="Output", type="numpy")]

            btn = gr.Button("Submit").style(full_width=True)
        # Wire the button to the generation function.
        btn.click(text2audio, inputs=[textbox, duration, guidance_scale, seed, n_candidates], outputs=outputs)
    # Footer with author credit.
    gr.HTML('''
        <hr>
        <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
            <p>Model by <a href="https://haoheliu.github.io/" style="text-decoration: underline;" target="_blank">Haohe Liu</a>
            </p>
        </div>
    ''')

# Allow two requests to be processed concurrently; keep the rest queued.
iface.queue(concurrency_count = 2)
iface.launch(debug=True)
|
|