|
import gradio as gr |
|
import torch, os |
|
import wave |
|
import numpy as np |
|
from scipy.io.wavfile import write

from scipy.signal import resample_poly
|
from PIL import Image |
|
import matplotlib.pyplot as plt |
|
from huggingface_hub import snapshot_download |
|
import soundfile as sf |
|
from auffusion_pipeline import AuffusionPipeline |
|
|
|
|
|
|
|
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline |
|
from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormalize_spectrogram, Generator, get_mel_spectrogram_from_audio |
|
from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize, prepare_mask_and_masked_image |
|
|
|
|
|
def convert_wav_to_16khz(input_path, output_path):
    """Resample a 16-bit PCM WAV file to 16 kHz and save it to output_path."""
    with wave.open(input_path, "rb") as wav_in:
        channels, sampwidth, framerate, nframes = wav_in.getparams()[:4]
        # Assumes 16-bit PCM samples; other sample widths are not handled here.
        audio_data = np.frombuffer(wav_in.readframes(nframes), dtype=np.int16)

    # Collapse multi-channel audio to mono before resampling.
    if channels > 1:
        audio_data = audio_data.reshape(-1, channels).mean(axis=1)

    new_framerate = 16000
    if framerate != new_framerate:
        # Polyphase resampling preserves pitch and duration at the new rate;
        # the gcd keeps the up/down factors small.
        factor = int(np.gcd(framerate, new_framerate))
        audio_data = resample_poly(audio_data, new_framerate // factor, framerate // factor)

    write(output_path, new_framerate, np.clip(audio_data, -32768, 32767).astype(np.int16))
    return output_path
|
|
|
def save_spectrogram_image(spectrogram, filename): |
|
"""Save a spectrogram as an image.""" |
|
plt.figure(figsize=(10, 4)) |
|
plt.imshow(spectrogram.squeeze(), aspect='auto', origin='lower', cmap='magma') |
|
plt.axis('off') |
|
plt.savefig(filename, bbox_inches='tight', pad_inches=0) |
|
plt.close() |
|
|
|
def infer(prompt, progress=gr.Progress(track_tqdm=True)):
    """Text-to-audio: generate a waveform from the text prompt and return the WAV path."""
    # The pipeline is re-instantiated on every request; caching it at module
    # level would avoid repeated checkpoint loads.
    pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
    output = pipeline(prompt=prompt)
    audio = output.audios[0]
    sf.write(f"{prompt}.wav", audio, samplerate=16000)

    return f"{prompt}.wav"
|
|
|
def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)): |
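    """Audio-to-audio: convert the reference clip to a normalized mel spectrogram,
    run Stable Diffusion img2img on it with the given prompt and strength, then
    vocode the edited spectrogram back to a 16 kHz waveform."""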
|
|
|
audio_path = convert_wav_to_16khz(audio_path, "output_16khz.wav") |
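    # Load the vocoder (spectrogram -> waveform) and the img2img pipeline from
    # the same pretrained checkpoint, downloading it if it is not cached locally.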
|
|
|
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter" |
|
dtype = torch.float16 |
|
device = "cuda" |
|
|
|
if not os.path.isdir(pretrained_model_name_or_path): |
|
pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path) |
|
|
|
vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder") |
|
vocoder = vocoder.to(device=device, dtype=dtype) |
|
|
|
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype) |
|
pipe = pipe.to(device) |
|
|
|
    width_start, width = 0, 160

    strength_list = [desired_strength]

    seed = 42
|
|
|
|
|
audio, sampling_rate = load_wav(audio_path) |
|
audio, spec = get_mel_spectrogram_from_audio(audio) |
|
norm_spec = normalize_spectrogram(spec) |
|
|
|
norm_spec = pad_spec(norm_spec, 1024) |
|
norm_spec = normalize(norm_spec) |
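    # Colorize the normalized spectrogram for the "input spectrogram" preview image.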
|
|
|
|
|
raw_image = image_add_color(torch_to_pil(norm_spec)) |
|
|
|
|
|
image_list = [] |
|
audio_list = [] |
|
|
|
generator = torch.Generator(device=device).manual_seed(seed) |
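    # Run img2img once per requested strength; higher strength deviates further
    # from the reference spectrogram.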
|
|
|
for strength in strength_list: |
|
with torch.autocast("cuda"): |
|
output_spec = pipe( |
|
prompt=prompt, image=norm_spec, num_inference_steps=100, generator=generator, output_type="pt", strength=strength, guidance_scale=7.5 |
|
).images[0] |
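        # The pipeline returns the edited spectrogram as a tensor; convert it to
        # a display image and, via the vocoder, back to a waveform.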
|
|
|
|
|
|
|
output_spec_image = torch_to_pil(output_spec) |
|
color_output_spec_image = image_add_color(output_spec_image) |
|
image_list.append(color_output_spec_image) |
|
|
|
|
|
denorm_spec = denormalize_spectrogram(output_spec) |
|
denorm_spec_audio = vocoder.inference(denorm_spec) |
|
audio_list.append(denorm_spec_audio) |
|
|
|
|
|
|
|
|
|
    # Stitch the per-strength spectrogram images side by side, separated by a
    # 20-pixel black gap (no gap after the last image).
    concat_image_list = []
    for i, img in enumerate(image_list):
        if i == len(image_list) - 1:
            concat_image_list.append(np.array(img))
        else:
            concat_image_list.append(np.concatenate([np.array(img), np.zeros((256, 20, 3))], axis=1))
|
|
|
concat_image = np.concatenate(concat_image_list, axis=1) |
|
concat_image = Image.fromarray(np.uint8(concat_image)) |
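    # Append one second of silence (16000 samples) after each generated clip
    # before concatenating them into a single waveform.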
|
|
|
|
|
concat_audio_list = [np.concatenate([audio, np.zeros((1, 16000))], axis=1) for audio in audio_list] |
|
concat_audio = np.concatenate(concat_audio_list, axis=1) |
|
|
|
print("audio_path:", audio_path) |
|
print("width_start:", width_start, "width:", width) |
|
print("text prompt:", prompt) |
|
print("strength_list:", strength_list) |
|
|
|
|
|
concat_audio = concat_audio.flatten() |
|
|
|
|
|
concat_audio = concat_audio / np.max(np.abs(concat_audio)) |
|
|
|
|
|
sf.write("output.wav", concat_audio, 16000) |
|
|
|
|
|
input_spec_image_path = "input_spectrogram.png" |
|
raw_image.save(input_spec_image_path) |
|
|
|
|
|
output_spec_image_path = "output_spectrogram.png" |
|
concat_image.save(output_spec_image_path) |
|
|
|
return "output.wav", input_spec_image_path, output_spec_image_path |
|
|
|
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)): |
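    """Audio inpainting: mask a time range of the reference clip's mel spectrogram,
    regenerate it with Stable Diffusion inpainting conditioned on the prompt, then
    vocode the result back to a 16 kHz waveform."""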
|
|
|
audio_path = convert_wav_to_16khz(audio_path, "output_16khz.wav") |
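    # Load the vocoder and the inpainting pipeline from the same pretrained
    # checkpoint, downloading it if it is not cached locally.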
|
|
|
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter" |
|
dtype = torch.float16 |
|
device = "cuda" |
|
|
|
if not os.path.isdir(pretrained_model_name_or_path): |
|
pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path) |
|
|
|
vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder") |
|
vocoder = vocoder.to(device=device, dtype=dtype) |
|
|
|
pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype) |
|
pipe = pipe.to(device) |
|
|
|
    # The mask covers `width` columns of the spectrogram (the time axis)
    # starting at `width_start`; this is the region the pipeline regenerates.
    width_start, width = mask_start_point, mask_end_point - mask_start_point

    seed = 42
|
|
|
|
|
audio, sampling_rate = load_wav(audio_path) |
|
audio, spec = get_mel_spectrogram_from_audio(audio) |
|
norm_spec = normalize_spectrogram(spec) |
|
norm_spec = pad_spec(norm_spec, 1024) |
|
norm_spec = normalize(norm_spec) |
|
|
|
raw_image = image_add_color(torch_to_pil(norm_spec)) |
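    # Build a binary mask over the selected time range (1 = region to inpaint)
    # and a masked copy of the spectrogram for preview.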
|
|
|
|
|
mask = torch.zeros_like(norm_spec)[:1,...] |
|
mask[:, :, width_start:width_start+width] = 1 |
|
mask_image = torch_to_pil(mask) |
|
|
|
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask) |
|
masked_spec_image = torch_to_pil(masked_spec) |
|
|
|
|
|
color_masked_spec_image = image_add_color(masked_spec_image) |
|
color_masked_spec_image = np.array(color_masked_spec_image) |
|
color_masked_spec_image[:, width_start:width_start+width, :] = 0 |
|
color_masked_spec_image = Image.fromarray(color_masked_spec_image) |
|
|
|
|
|
generator = torch.Generator(device=device).manual_seed(seed) |
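    # Regenerate only the masked time region, conditioned on the prompt; the
    # unmasked part of the spectrogram is preserved.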
|
|
|
with torch.autocast("cuda"): |
|
output_spec = pipe( |
|
prompt=prompt, image=norm_spec, mask_image=mask, num_inference_steps=100, generator=generator, height=256, width=1024, output_type="pt" |
|
).images[0] |
|
|
|
output_spec_image = torch_to_pil(output_spec) |
|
color_output_spec_image = image_add_color(output_spec_image) |
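    # The two reconstructions below (original and masked input audio) are not
    # part of the returned outputs; they are kept for reference/debugging.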
|
|
|
|
|
post_norm_spec = denormalize(norm_spec).to(device, dtype) |
|
raw_chunk_spec = denormalize_spectrogram(post_norm_spec) |
|
raw_chunk_audio = vocoder.inference(raw_chunk_spec) |
|
|
|
post_masked_spec = denormalize(masked_spec).to(device, dtype) |
|
denorm_masked_spec = denormalize_spectrogram(post_masked_spec) |
|
denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec) |
|
|
|
denorm_spec = denormalize_spectrogram(output_spec) |
|
denorm_spec_audio = vocoder.inference(denorm_spec) |
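    # Peak-normalize the inpainted waveform and write it out at 16 kHz.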
|
|
|
|
|
|
|
|
|
denorm_spec_audio = denorm_spec_audio.flatten() |
|
|
|
|
|
denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio)) |
|
|
|
|
|
sf.write("output.wav", denorm_spec_audio, 16000) |
|
|
|
|
|
input_spec_image_path = "input_spectrogram.png" |
|
raw_image.save(input_spec_image_path) |
|
|
|
|
|
output_spec_image_path = "output_spectrogram.png" |
|
color_output_spec_image.save(output_spec_image_path) |
|
|
|
return "output.wav", input_spec_image_path, color_output_spec_image |
|
|
|
def load_input_spectrogram(audio_path): |
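    """Render the reference audio's normalized mel spectrogram as a colorized PNG and return its path."""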
|
|
|
audio, sampling_rate = load_wav(audio_path) |
|
audio, spec = get_mel_spectrogram_from_audio(audio) |
|
norm_spec = normalize_spectrogram(spec) |
|
norm_spec = pad_spec(norm_spec, 1024) |
|
norm_spec = normalize(norm_spec) |
|
|
|
raw_image = image_add_color(torch_to_pil(norm_spec)) |
|
|
|
|
|
input_spec_image_path = "input_spectrogram.png" |
|
raw_image.save(input_spec_image_path) |
|
|
|
return input_spec_image_path |
|
|
|
def preview_masked_area(audio_path, mask_start_point, mask_end_point): |
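    """Return a PNG preview of the reference spectrogram with the selected time range blacked out."""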
|
|
|
audio, sampling_rate = load_wav(audio_path) |
|
audio, spec = get_mel_spectrogram_from_audio(audio) |
|
norm_spec = normalize_spectrogram(spec) |
|
norm_spec = pad_spec(norm_spec, 1024) |
|
norm_spec = normalize(norm_spec) |
|
|
|
|
|
width_start, width = mask_start_point, mask_end_point-mask_start_point |
|
mask = torch.zeros_like(norm_spec)[:1,...] |
|
mask[:, :, width_start:width_start+width] = 1 |
|
mask_image = torch_to_pil(mask) |
|
|
|
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask) |
|
masked_spec_image = torch_to_pil(masked_spec) |
|
|
|
|
|
color_masked_spec_image = image_add_color(masked_spec_image) |
|
color_masked_spec_image = np.array(color_masked_spec_image) |
|
color_masked_spec_image[:, width_start:width_start+width, :] = 0 |
|
color_masked_spec_image = Image.fromarray(color_masked_spec_image) |
|
|
|
|
|
masked_spec_image_path = "masked_spectrogram.png" |
|
color_masked_spec_image.save(masked_spec_image_path) |
|
|
|
return masked_spec_image_path |
|
|
|
def load_inpaint_example(prompt_inp, audio_path): |
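    """Populate the spectrogram previews for the cached inpainting examples (mask fixed at 256-768)."""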
|
|
|
in_spec_path = load_input_spectrogram(audio_path) |
|
masked_spec_path = preview_masked_area(audio_path, 256, 768) |
|
|
|
return in_spec_path, masked_spec_path |
|
|
|
css=""" |
|
div#col-container{ |
|
margin: 0 auto; |
|
max-width: 640px; |
|
} |
|
""" |
|
with gr.Blocks(css=css) as demo: |
|
with gr.Column(elem_id="col-container"): |
|
gr.Markdown("# Auffusion") |
|
gr.Markdown("Auffusion can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. ") |
|
gr.HTML(""" |
|
<div style="display:flex;column-gap:4px;"> |
|
<a href="https://auffusion.github.io/"> |
|
<img src='https://img.shields.io/badge/Project-Page-green'> |
|
</a> |
|
<a href="https://github.com/happylittlecat2333/Auffusion"> |
|
<img src='https://img.shields.io/badge/GitHub-Repo-blue'> |
|
</a> |
|
<a href="https://arxiv.org/pdf/2401.01044"> |
|
<img src='https://img.shields.io/badge/ArXiv-Paper-red'> |
|
</a> |
|
<a href="https://huggingface.co/spaces/fffiloni/auffusion?duplicate=true"> |
|
<img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space"> |
|
</a> |
|
</div> |
|
""") |
|
with gr.Tab("Text-to-Audio"): |
|
prompt = gr.Textbox(label="Prompt") |
|
submit_btn = gr.Button("Submit") |
|
            audio_out = gr.Audio(label="Audio Result")
|
|
|
gr.Examples( |
|
examples = [ |
|
"Rolling thunder with lightning strikes", |
|
"Two gunshots followed by birds chirping", |
|
"A train whistle blowing in the distance" |
|
], |
|
inputs = [prompt] |
|
) |
|
|
|
submit_btn.click( |
|
fn = infer, |
|
inputs = [prompt], |
|
outputs = [audio_out] |
|
) |
|
|
|
with gr.Tab("Audio-to-Audio"): |
|
prompt_img2img = gr.Textbox(label="Prompt") |
|
audio_in_img2img = gr.Audio(label="Audio Reference", type="filepath", format="wav") |
|
prompt_strength = gr.Slider(label="Prompt Strength", minimum=0.0, maximum=1.0, step=0.1, value=0.7) |
|
submit_btn_img2img = gr.Button("Submit") |
|
            audio_out_img2img = gr.Audio(label="Audio Result")
|
|
|
with gr.Accordion("Compare Spectrograms", open=False): |
|
with gr.Column(): |
|
input_spectrogram = gr.Image(label="Input Spectrogram") |
|
output_spectrogram = gr.Image(label="Output Spectrogram") |
|
|
|
gr.Examples( |
|
examples = [ |
|
["Ambulance siren", "./notebooks/examples/img2img/GIOApFAWDOc_160.wav"], |
|
["A cat is moewing", "./notebooks/examples/img2img/YniwgMbB6tpQ_01.wav"], |
|
["A car racing", "./notebooks/examples/img2img/_GI7meqlYZk_30.wav"] |
|
], |
|
inputs = [prompt_img2img, audio_in_img2img] |
|
) |
|
|
|
submit_btn_img2img.click( |
|
fn = infer_img2img, |
|
inputs = [prompt_img2img, audio_in_img2img, prompt_strength], |
|
outputs = [audio_out_img2img, input_spectrogram, output_spectrogram] |
|
) |
|
|
|
with gr.Tab("Audio InPainting"): |
|
prompt_inp = gr.Textbox(label="Prompt") |
|
audio_in_inp = gr.Audio(label="Audio Reference", type="filepath", format="wav") |
|
|
|
            audio_in_spec = gr.Image(label="Input Audio Spectrogram")
|
            mask_start_point = gr.Slider(label="Mask Start Point", minimum=0, maximum=1024, step=1, value=256)

            mask_end_point = gr.Slider(label="Mask End Point", minimum=0, maximum=1024, step=1, value=768)
|
preview_mask_btn = gr.Button("Preview Mask") |
|
|
|
masked_spec_preview = gr.Image(label="Spectrogram Mask Preview") |
|
submit_btn_inp = gr.Button("Submit") |
|
|
|
            audio_out_inp = gr.Audio(label="Audio Result")
|
|
|
with gr.Accordion("Compare Spectrograms", open=False): |
|
with gr.Column(): |
|
input_spectrogram_inp = gr.Image(label="Input Spectrogram") |
|
output_spectrogram_inp = gr.Image(label="Output Spectrogram") |
|
|
|
gr.Examples( |
|
examples = [ |
|
["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"], |
|
["A woman speaking", "./notebooks/examples/inpainting/9z8XIRyUq9Q_30.wav"], |
|
["An infant crying", "./notebooks/examples/inpainting/14ekd4nkpwc_28.wav"], |
|
["A dog barking and growling", "./notebooks/examples/inpainting/3ek-xLwr05Q_30.wav"] |
|
], |
|
fn = load_inpaint_example, |
|
inputs = [prompt_inp, audio_in_inp], |
|
outputs = [audio_in_spec, masked_spec_preview], |
|
cache_examples = True |
|
) |
|
|
|
audio_in_inp.upload( |
|
fn = load_input_spectrogram, |
|
inputs = [audio_in_inp], |
|
outputs = [audio_in_spec] |
|
) |
|
|
|
preview_mask_btn.click( |
|
fn = preview_masked_area, |
|
inputs = [audio_in_inp, mask_start_point, mask_end_point], |
|
outputs = [masked_spec_preview] |
|
) |
|
|
|
submit_btn_inp.click( |
|
fn = infer_inp, |
|
inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point], |
|
outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp] |
|
) |
|
|
|
demo.queue().launch(show_api=False, show_error=True) |