import os
import tempfile
from typing import List

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import spaces  # Hugging Face Spaces helper providing the @spaces.GPU() decorator
from audiosr import super_resolution

from audiosr_utils import load_audiosr

# Load the AudioSR model once at startup so every request reuses it.
audiosr_model = load_audiosr()
|
|
def split_audio_to_chunks(y, sr=48000, chunk_duration=5.12) -> List[str]:
    """Split a waveform into fixed-length chunks, each saved to a temporary WAV file."""
    chunk_samples = int(chunk_duration * sr)
    chunks = [y[i : i + chunk_samples] for i in range(0, len(y), chunk_samples)]

    temp_files = []
    for chunk in chunks:
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_files.append(temp_file.name)
        sf.write(temp_file.name, chunk, sr)
    return temp_files
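# Hypothetical usage sketch: 12 s of audio at 48 kHz yields three chunks
# (5.12 s + 5.12 s + a 1.76 s remainder), each written to its own temp file:
#   paths = split_audio_to_chunks(np.zeros(12 * 48000), sr=48000)
#   assert len(paths) == 3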
|
|
@spaces.GPU()
def run_audiosr(
    chunks: List[str], guidance_scale: float, ddim_steps: int
) -> np.ndarray:
    """Super-resolve each chunk file with AudioSR and concatenate the results."""
    waveforms = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}")
        waveform = super_resolution(
            audiosr_model,
            chunk,
            guidance_scale=guidance_scale,
            ddim_steps=ddim_steps,
        )
        waveforms.append(waveform)
    # Join the chunk outputs along the time (last) axis and drop the batch dims.
    waveform = np.concatenate(waveforms, axis=-1)
    return waveform.squeeze()
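# Note: @spaces.GPU() requests a GPU only for the duration of this call when
# running on Hugging Face ZeroGPU hardware; elsewhere it is effectively a no-op.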
|
|
def audiosr_infer(audio: str) -> str:
    guidance_scale = 3.5
    ddim_steps = 100

    y, sr = librosa.load(audio, sr=48000)
    if len(y) > 60 * sr:
        y = y[: 60 * sr]
        gr.Info("Audio is too long; only the first 60 seconds will be processed")

    chunk_files = split_audio_to_chunks(y, sr=sr, chunk_duration=5.12)
    print(f"Split audio into chunks: {chunk_files}")

    waveform = run_audiosr(chunk_files, guidance_scale, ddim_steps)
    sr = 48000  # AudioSR produces 48 kHz output; writing at 44.1 kHz would mislabel (and slow) the audio

    # Remove the temporary chunk files now that inference is done.
    for chunk_file in chunk_files:
        os.remove(chunk_file)

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        sf.write(f.name, waveform, sr)
        return f.name
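# Hypothetical usage sketch outside the UI: upscale a local file directly and
# get back the path of a temporary .mp3 holding the enhanced audio:
#   enhanced_path = audiosr_infer("input_16khz.wav")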
|
|
models = {
    "AudioSR": audiosr_infer,
}
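# Registry mapping UI model names to inference functions; models added here
# show up automatically in the Radio choices below.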
|
|
def infer(audio: str, model: str, sr: int) -> str:
    # Optionally resample the input first, e.g. to simulate a low-quality source.
    if sr > 0:
        y, _ = librosa.load(audio, sr=sr)

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, y, sr)
            return models[model](f.name)
    else:
        return models[model](audio)
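# Hypothetical example: pretend the source is 8 kHz audio, then super-resolve it:
#   result_path = infer("speech.wav", "AudioSR", 8000)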
|
|
with gr.Blocks() as app:
    with open(os.path.join(os.path.dirname(__file__), "README.md"), "r") as f:
        README = f.read()

    # Drop the YAML front matter (everything up to the second "---") from the README.
    blocks = README.split("---")
    if len(blocks) > 1:
        README = "---".join(blocks[2:])

    gr.Markdown(README)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Upload an audio file")
            audio = gr.Audio(label="Upload an audio file", type="filepath")
            sr = gr.Slider(
                value=0,
                label="Resample the audio to this sample rate before inference (0 means no resampling)",
                minimum=0,
                maximum=48000,
                step=1000,
            )

    with gr.Row():
        model = gr.Radio(
            label="Select a model",
            choices=list(models.keys()),
            value="AudioSR",
        )
        btn = gr.Button("Infer")

    with gr.Row():
        with gr.Column():
            out = gr.Audio(
                label="Output", format="mp3", type="filepath", interactive=False
            )

    btn.click(
        fn=infer,
        inputs=[audio, model, sr],
        outputs=[out],
        api_name="infer",
    )
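# The api_name above also exposes this app programmatically. A sketch using
# gradio_client (assuming the app is reachable at this hypothetical URL):
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(handle_file("input.wav"), "AudioSR", 0, api_name="/infer")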
|
app.launch(show_error=True)