Update app.py
app.py CHANGED
@@ -8,7 +8,7 @@ import tempfile
 import numpy as np
 import os
 
-# Set MPS
+# Set device (MPS for Mac, CUDA for other GPUs, otherwise CPU)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Title and Description
@@ -22,7 +22,7 @@ Only the first 10 seconds of the audio will be processed.
 # Upload audio file
 uploaded_file = st.file_uploader("Upload an audio file (WAV format)", type=["wav"])
 
-# Model Parameters
+# Sidebar: Model Parameters
 st.sidebar.title("Model Parameters")
 model_name = st.sidebar.selectbox("Select Model", ["basic", "speech"], index=0)
 ddim_steps = st.sidebar.slider("DDIM Steps", min_value=10, max_value=100, value=50)
@@ -30,18 +30,27 @@ guidance_scale = st.sidebar.slider("Guidance Scale", min_value=1.0, max_value=10
 random_seed = st.sidebar.number_input("Random Seed", min_value=0, value=42, step=1)
 latent_t_per_second = 12.8
 
-# Helper function
+# Helper function: Plot linear STFT spectrogram
 def plot_spectrogram(waveform, sample_rate, title):
     plt.figure(figsize=(10, 4))
-    spectrogram =
-
-
-
-
+    spectrogram = torch.stft(
+        torch.tensor(waveform),
+        n_fft=2048,
+        hop_length=512,
+        win_length=2048,
+        return_complex=True,
+    ).abs().numpy()
+    plt.imshow(
+        np.log1p(spectrogram),
+        aspect="auto",
+        origin="lower",
+        extent=[0, waveform.shape[-1] / sample_rate, 0, sample_rate / 2],
+        cmap="viridis",
+    )
     plt.colorbar(format="%+2.0f dB")
     plt.title(title)
-    plt.xlabel("Time")
-    plt.ylabel("Frequency")
+    plt.xlabel("Time (s)")
+    plt.ylabel("Frequency (Hz)")
     plt.tight_layout()
     st.pyplot(plt)
 
@@ -49,7 +58,6 @@ def plot_spectrogram(waveform, sample_rate, title):
 if uploaded_file and st.button("Enhance Audio"):
     st.write("Processing audio...")
 
-    # Create temp directory for saving files
     with tempfile.TemporaryDirectory() as temp_dir:
         input_path = os.path.join(temp_dir, "input.wav")
         truncated_path = os.path.join(temp_dir, "truncated.wav")
@@ -59,7 +67,7 @@ if uploaded_file and st.button("Enhance Audio"):
         with open(input_path, "wb") as f:
             f.write(uploaded_file.read())
 
-        # Load
+        # Load and truncate the first 10 seconds
        waveform, sample_rate = torchaudio.load(input_path)
         max_samples = sample_rate * 10  # First 10 seconds
         if waveform.size(1) > max_samples:
@@ -85,11 +93,12 @@ if uploaded_file and st.button("Enhance Audio"):
         )
 
         # Save enhanced audio
-
+        output_waveform = waveform_sr.detach().numpy()
+        save_wave(torch.tensor(output_waveform), inputpath=truncated_path, savepath=temp_dir, name="output", samplerate=48000)
 
-        # Plot
+        # Plot enhanced spectrogram
         st.write("Enhanced Audio Spectrogram:")
-        plot_spectrogram(
+        plot_spectrogram(output_waveform, 48000, title="Enhanced Audio Spectrogram")
 
         # Display audio players and download link
         st.audio(truncated_path, format="audio/wav")
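
One note on the first hunk: the rewritten comment mentions MPS, but the device line itself still only checks `torch.cuda.is_available()`, so Apple-silicon machines would fall back to CPU. A minimal sketch of a selection order that matches the comment is below; it assumes a PyTorch build recent enough to expose `torch.backends.mps` and is not part of the committed change.

```python
import torch

# Hypothetical device pick matching the comment's intent:
# prefer MPS on Apple silicon, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
```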
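
The new helper builds a magnitude spectrogram with `torch.stft` and draws it with `plt.imshow` on a `log1p` scale. For a rough standalone check of that math outside Streamlit, a sketch follows; the 440 Hz test tone, the 16 kHz sample rate, and running it as a plain script are illustrative assumptions, not part of the app.

```python
import numpy as np
import torch

# Synthetic input: one second of a 440 Hz tone (assumption for illustration).
sample_rate = 16000
t = np.linspace(0, 1, sample_rate, endpoint=False)
waveform = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)

# Same magnitude STFT as the helper in app.py.
spectrogram = torch.stft(
    torch.tensor(waveform),
    n_fft=2048,
    hop_length=512,
    win_length=2048,
    return_complex=True,
).abs().numpy()

# Rows are frequency bins (n_fft // 2 + 1 = 1025), columns are time frames.
print(spectrogram.shape)
```

With these settings, most of the energy should sit in the row nearest 440 Hz, i.e. around bin `round(440 * 2048 / 16000) ≈ 56`.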