feat: Implement stereo audio to MIDI transcription
This commit introduces a stereo processing workflow for audio-to-MIDI transcription, preserving spatial information from stereo recordings. The previous implementation was limited to mono processing.
Scale MIDI velocities by 0.8 in Stereo Transcription to avoid loudness/clipping after merge

Applied `scale_instrument_velocity(scale=0.8)` during stereo transcription to prevent the excessive loudness caused by summing the left- and right-channel MIDI tracks. This preserves a more natural dynamic range, avoiding clipping and keeping perceived volume consistent after rendering to WAV/FLAC (see the sketch after the file summary below).
- app.py +289 -36
- requirements.txt +2 -0
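
As a rough illustration of the headroom problem the 0.8 factor addresses (a hypothetical example, not code from this commit):

import pretty_midi

# Transcribing the left and right channels of a near-mono recording tends to
# produce two instruments playing the same note at full strength; rendered
# together, the coincident notes roughly sum their amplitudes.
left = pretty_midi.Instrument(program=0, name="Left - Piano")
right = pretty_midi.Instrument(program=0, name="Right - Piano")
for inst in (left, right):
    inst.notes.append(pretty_midi.Note(velocity=110, pitch=60, start=0.0, end=1.0))

# Scaling each velocity by 0.8 (110 -> 88) leaves headroom before the
# renderer normalizes or clips the summed signal.
for inst in (left, right):
    for note in inst.notes:
        note.velocity = max(1, min(127, int(note.velocity * 0.8)))

print(left.notes[0].velocity, right.notes[0].velocity)  # -> 88 88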
app.py (CHANGED)
@@ -1,14 +1,15 @@
 # =================================================================
 #
-# Merged and Integrated Script for Audio/MIDI Processing and Rendering
+# Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
 #
 # This script combines two functionalities:
 # 1. Transcribing audio to MIDI using two methods:
 #    a) A general-purpose model (basic-pitch by Spotify).
 #    b) A model specialized for solo piano (ByteDance).
+#    - Includes stereo processing by splitting channels, transcribing independently, and merging MIDI.
 # 2. Applying advanced transformations and re-rendering MIDI files using:
-#    a) Standard SoundFonts via FluidSynth.
-#    b) A custom 8-bit style synthesizer for a chiptune sound.
+#    a) Standard SoundFonts via FluidSynth (produces stereo audio).
+#    b) A custom 8-bit style synthesizer for a chiptune sound (updated for stereo output).
 #
 # The user can upload an audio (e.g., WAV, MP3) or MIDI file.
 # - If an audio file is uploaded, it is first transcribed to MIDI using the selected method.
@@ -29,7 +30,7 @@
 #
 # pip install gradio torch pytz numpy scipy matplotlib networkx scikit-learn
 # pip install piano_transcription_inference huggingface_hub
-# pip install basic-pitch pretty_midi librosa
+# pip install basic-pitch pretty_midi librosa soundfile
 #
 # =================================================================
 # Core modules:
@@ -42,6 +43,9 @@ import os
 import hashlib
 import time as reqtime
 import copy
+import librosa
+import pyloudnorm as pyln
+import soundfile as sf
 
 import torch
 import gradio as gr
@@ -60,7 +64,7 @@ import basic_pitch
 from basic_pitch.inference import predict
 from basic_pitch import ICASSP_2022_MODEL_PATH
 
-# --- Imports for 8-bit Synthesizer ---
+# --- Imports for 8-bit Synthesizer & MIDI Merging ---
 import pretty_midi
 import numpy as np
 from scipy import signal
@@ -158,18 +162,36 @@ def prepare_soundfonts():
     return ordered_soundfont_map
 
 # =================================================================================================
-# === 8-bit Style Synthesizer ===
+# === 8-bit Style Synthesizer (Stereo Enabled) ===
 # =================================================================================================
 def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, pulse_width, vibrato_rate, vibrato_depth, bass_boost_level, fs=44100):
     """
     Synthesizes an 8-bit style audio waveform from a PrettyMIDI object.
     This function generates waveforms manually instead of using a synthesizer like FluidSynth.
     Includes an optional sub-octave bass booster with adjustable level.
+    Instruments are panned based on their order in the MIDI file.
+    Instrument 1 -> Left, Instrument 2 -> Right.
     """
     total_duration = midi_data.get_end_time()
-    waveform = np.zeros(int(total_duration * fs) + fs)
+    # Initialize a stereo waveform buffer (2 channels: Left, Right)
+    waveform = np.zeros((2, int(total_duration * fs) + fs))
 
-    for instrument in midi_data.instruments:
+    num_instruments = len(midi_data.instruments)
+
+    for i, instrument in enumerate(midi_data.instruments):
+        # --- Panning Logic ---
+        # Default to center-panned mono
+        pan_l, pan_r = 0.707, 0.707
+        if num_instruments == 2:
+            if i == 0:  # First instrument panned left
+                pan_l, pan_r = 1.0, 0.0
+            elif i == 1:  # Second instrument panned right
+                pan_l, pan_r = 0.0, 1.0
+        elif num_instruments > 2:
+            if i == 0: pan_l, pan_r = 1.0, 0.0   # Left
+            elif i == 1: pan_l, pan_r = 0.0, 1.0 # Right
+            # Other instruments remain centered
 
         for note in instrument.notes:
             freq = pretty_midi.note_number_to_hz(note.pitch)
             note_duration = note.end - note.start
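
A note on the 0.707 default above: it is 1/sqrt(2), the equal-power center pan, so a center-panned instrument contributes the same total power as one panned hard left or right. A small sketch of the underlying identity (illustrative only; pan_gains is a hypothetical helper, not part of this commit):

import numpy as np

# Equal-power panning chooses gains with g_l**2 + g_r**2 == 1, keeping the
# summed power constant as a source moves across the stereo field.
def pan_gains(pan):
    # pan in [-1.0 (hard left), +1.0 (hard right)]
    theta = (pan + 1.0) * np.pi / 4.0
    return np.cos(theta), np.sin(theta)

for p in (-1.0, 0.0, 1.0):
    g_l, g_r = pan_gains(p)
    print(f"pan={p:+.1f}  L={g_l:.3f}  R={g_r:.3f}  power={g_l**2 + g_r**2:.3f}")
# pan=0.0 gives L=R=0.707, matching the pan_l, pan_r = 0.707, 0.707 default above.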
@@ -222,13 +244,162 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s,
 
             start_sample = int(note.start * fs)
             end_sample = start_sample + num_samples
-            if end_sample > len(waveform):
-                end_sample = len(waveform)
+            if end_sample > waveform.shape[1]:
+                end_sample = waveform.shape[1]
             note_waveform = note_waveform[:end_sample-start_sample]
 
-            waveform[start_sample:end_sample] += note_waveform
+            # Add the mono note waveform to the stereo buffer with panning
+            waveform[0, start_sample:end_sample] += note_waveform * pan_l
+            waveform[1, start_sample:end_sample] += note_waveform * pan_r
 
-    return waveform
+    return waveform # Returns a (2, N) numpy array
+
+
+def analyze_midi_velocity(midi_path):
+    midi = pretty_midi.PrettyMIDI(midi_path)
+    all_velocities = []
+
+    print(f"Analyzing velocity for MIDI: {midi_path}")
+    for i, instrument in enumerate(midi.instruments):
+        velocities = [note.velocity for note in instrument.notes]
+        all_velocities.extend(velocities)
+
+        if velocities:
+            print(f"Instrument {i} ({instrument.name}):")
+            print(f"  Notes count: {len(velocities)}")
+            print(f"  Velocity min: {min(velocities)}")
+            print(f"  Velocity max: {max(velocities)}")
+            print(f"  Velocity mean: {np.mean(velocities):.2f}")
+        else:
+            print(f"Instrument {i} ({instrument.name}): no notes found.")
+
+    if all_velocities:
+        print("\nOverall MIDI velocity stats:")
+        print(f"  Total notes: {len(all_velocities)}")
+        print(f"  Velocity min: {min(all_velocities)}")
+        print(f"  Velocity max: {max(all_velocities)}")
+        print(f"  Velocity mean: {np.mean(all_velocities):.2f}")
+    else:
+        print("No notes found in this MIDI.")
+
+
+def scale_instrument_velocity(instrument, scale=0.8):
+    for note in instrument.notes:
+        note.velocity = max(1, min(127, int(note.velocity * scale)))
+
+
+def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
+    """
+    Normalizes the audio data to a target integrated loudness (LUFS).
+    This provides more consistent perceived volume than peak normalization.
+
+    Args:
+        audio_data (np.ndarray): The audio signal.
+        sample_rate (int): The sample rate of the audio.
+        target_lufs (float): The target loudness in LUFS. Defaults to -23.0,
+                             a common standard for broadcast.
+
+    Returns:
+        np.ndarray: The loudness-normalized audio data.
+    """
+    try:
+        # 1. Measure the integrated loudness of the input audio
+        meter = pyln.Meter(sample_rate) # create meter
+        loudness = meter.integrated_loudness(audio_data)
+
+        # 2. Calculate the gain needed to reach the target loudness
+        #    The gain is applied in the linear domain, so we convert from dB
+        loudness_gain_db = target_lufs - loudness
+        loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0)
+
+        # 3. Apply the gain
+        normalized_audio = audio_data * loudness_gain_linear
+
+        # 4. Final safety check: peak normalize to prevent clipping, just in case
+        #    the loudness normalization results in peaks > 1.0
+        peak_val = np.max(np.abs(normalized_audio))
+        if peak_val > 1.0:
+            normalized_audio /= peak_val
+            print("Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.")
+
+        print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.")
+        return normalized_audio
+
+    except Exception as e:
+        print(f"Loudness normalization failed: {e}. Falling back to original audio.")
+        return audio_data
+
+
+# =================================================================================================
+# === MIDI Merging Function ===
+# =================================================================================================
+def merge_midis(midi_path_left, midi_path_right, output_path):
+    """
+    Merges two MIDI files into a single MIDI file. This robust version iterates
+    through ALL instruments in both MIDI files, ensuring no data is lost if the
+    source files are multi-instrumental.
+
+    It applies hard-left panning (Pan=0) to every instrument from the left MIDI
+    and hard-right panning (Pan=127) to every instrument from the right MIDI.
+    """
+    try:
+        analyze_midi_velocity(midi_path_left)
+        analyze_midi_velocity(midi_path_right)
+        midi_left = pretty_midi.PrettyMIDI(midi_path_left)
+        midi_right = pretty_midi.PrettyMIDI(midi_path_right)
+
+        merged_midi = pretty_midi.PrettyMIDI()
+
+        # --- Process ALL instruments from the left channel MIDI ---
+        if midi_left.instruments:
+            print(f"Found {len(midi_left.instruments)} instrument(s) in the left channel MIDI.")
+            # Use a loop to iterate through every instrument
+            for instrument in midi_left.instruments:
+                scale_instrument_velocity(instrument, scale=0.8)
+                # To avoid confusion, we can prefix the instrument name
+                instrument.name = f"Left - {instrument.name if instrument.name else 'Instrument'}"
+
+                # Create a Control Change event for Pan (controller number 10).
+                # Set its value to 0 for hard left panning.
+                # Add it at the very beginning of the track (time=0.0).
+                pan_left = pretty_midi.ControlChange(number=10, value=0, time=0.0)
+                # Use insert() to ensure the pan event is the very first one
+                instrument.control_changes.insert(0, pan_left)
+
+                # Append the fully processed instrument to the merged MIDI
+                merged_midi.instruments.append(instrument)
+
+        # --- Process ALL instruments from the right channel MIDI ---
+        if midi_right.instruments:
+            print(f"Found {len(midi_right.instruments)} instrument(s) in the right channel MIDI.")
+            # Use a loop here as well
+            for instrument in midi_right.instruments:
+                scale_instrument_velocity(instrument, scale=0.8)
+                instrument.name = f"Right - {instrument.name if instrument.name else 'Instrument'}"
+
+                # Create a Control Change event for Pan (controller number 10).
+                # Set its value to 127 for hard right panning.
+                # Add it at the very beginning of the track (time=0.0).
+                pan_right = pretty_midi.ControlChange(number=10, value=127, time=0.0)
+                instrument.control_changes.insert(0, pan_right)
+
+                merged_midi.instruments.append(instrument)
+
+        merged_midi.write(output_path)
+        print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'")
+        analyze_midi_velocity(output_path)
+        return output_path
+
+    except Exception as e:
+        print(f"Error merging MIDI files: {e}")
+        # Fallback logic remains the same
+        if os.path.exists(midi_path_left):
+            print("Fallback: Using only the left channel MIDI.")
+            return midi_path_left
+        return None
+
 
 # =================================================================================================
 # === Stage 1: Audio to MIDI Transcription Functions ===
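
To make the gain computation in normalize_loudness() concrete (the -17.3 LUFS meter reading below is an assumed example value):

# target_lufs - measured loudness gives the correction in dB;
# dividing by 20 and exponentiating converts dB to a linear amplitude factor.
measured_lufs = -17.3                    # assumed pyln meter reading
target_lufs = -23.0
gain_db = target_lufs - measured_lufs    # -5.7 dB (turn the signal down)
gain_linear = 10.0 ** (gain_db / 20.0)   # ~0.519x amplitude
print(f"{gain_db:.1f} dB -> x{gain_linear:.3f}")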
@@ -254,7 +425,7 @@ def TranscribePianoAudio(input_file):
     # Use os.path.join to create a platform-independent directory path
     output_dir = os.path.join("output", "transcribed_piano_")
     out_mid_path = os.path.join(output_dir, fn1 + '.mid')
-
+
     # Check for the directory's existence and create it if necessary
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
@@ -412,7 +583,7 @@ def Render_MIDI(input_midi_path,
     escore = TMIDIX.merge_escore_notes(escore, merge_threshold=merge_misaligned_notes)
 
     escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1)
-
+
     first_note_index = [e[0] for e in raw_score[1]].index('note')
     cscore = TMIDIX.chordify_score([1000, escore])
 
@@ -420,7 +591,7 @@ def Render_MIDI(input_midi_path,
 
     aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True)
     song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes)
-
+
     print('Done!')
     print('=' * 70)
     print('Input MIDI metadata:', meta_data[:5])
@@ -472,7 +643,7 @@ def Render_MIDI(input_midi_path,
 
     if render_transpose_to_C4:
         output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60) # C4 is MIDI pitch 60
-
+
     if render_align == "Start Times":
         output_score = TMIDIX.recalculate_score_timings(output_score)
         output_score = TMIDIX.align_escore_notes_to_bars(output_score)
@@ -573,11 +744,12 @@ def Render_MIDI(input_midi_path,
                     s8bit_bass_boost_level,
                     fs=srate
                 )
-                # Normalize
+                # Normalize and prepare for Gradio
                 peak_val = np.max(np.abs(audio))
                 if peak_val > 0:
                     audio /= peak_val
-
+                # Transpose from (2, N) to (N, 2) and convert to int16 for Gradio
+                audio_out = (audio.T * 32767).astype(np.int16)
             except Exception as e:
                 print(f"Error during 8-bit synthesis: {e}")
                 return [None] * 7
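
Why the transpose and 32767 scaling: Gradio's Audio component takes a (sample_rate, array) tuple with samples along the first axis, i.e. (N, 2) for stereo, while the synthesizer produces a (2, N) float buffer in [-1.0, 1.0]. A minimal sketch of the conversion:

import numpy as np

audio = np.zeros((2, 44100))                     # placeholder synth output, (channels, samples)
audio_out = (audio.T * 32767).astype(np.int16)   # 32767 = 2**15 - 1, int16 full scale
print(audio_out.shape, audio_out.dtype)          # (44100, 2) int16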
@@ -603,7 +775,7 @@ def Render_MIDI(input_midi_path,
         with open(midi_to_render_path, 'rb') as f:
             midi_file_content = f.read()
 
-        audio = midi_to_colab_audio(midi_file_content,
+        audio_out = midi_to_colab_audio(midi_file_content,
                        soundfont_path=soundfont_path, # Use the dynamically found path
                        sample_rate=srate,
                        output_for_gradio=True
@@ -619,7 +791,7 @@ def Render_MIDI(input_midi_path,
 
     output_midi_summary = str(meta_data)
 
-    return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio), output_plot, song_description
+    return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description
 
 # =================================================================================================
 # === Main Application Logic ===
@@ -627,6 +799,7 @@ def Render_MIDI(input_midi_path,
 
 def process_and_render_file(input_file,
                             # --- Transcription params ---
+                            enable_stereo_processing,
                             transcription_method,
                             onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool,
                             # --- MIDI rendering params ---
@@ -645,14 +818,19 @@ def process_and_render_file(input_file,
     start_time = reqtime.time()
     if input_file is None:
         # Return a list of updates to clear all output fields
-
-        return [gr.update(value=None)] * num_outputs
+        return [gr.update(value=None)] * 7
 
     # The input_file from gr.Audio(type="filepath") is now the direct path (a string),
     # not a temporary file object. We no longer need to access the .name attribute.
     input_file_path = input_file
     filename = os.path.basename(input_file_path)
     print(f"Processing new file: {filename}")
+
+    if not filename.lower().endswith(('.mid', '.midi', '.kar')):
+        # Only decode audio uploads; librosa cannot open MIDI files.
+        try:
+            audio_data, native_sample_rate = librosa.load(input_file_path, sr=None, mono=False)
+        except Exception as e:
+            raise gr.Error(f"Failed to load audio file: {e}")
 
     # --- Step 1: Check file type and transcribe if necessary ---
     if filename.lower().endswith(('.mid', '.midi', '.kar')):
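
The stereo branch below relies on librosa's shape contract: with mono=False a stereo file loads as a (2, N) array while a mono file stays 1-D with shape (N,), which is exactly what the ndim/shape check tests. A small sketch ("input.wav" is a placeholder path):

import librosa

audio_data, sr = librosa.load("input.wav", sr=None, mono=False)
if audio_data.ndim == 2 and audio_data.shape[0] == 2:
    print(f"stereo: {audio_data.shape[1]} samples per channel at {sr} Hz")
else:
    print(f"mono: {audio_data.shape[0]} samples at {sr} Hz")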
@@ -660,17 +837,86 @@ def process_and_render_file(input_file,
         midi_path_for_rendering = input_file_path
     else: #if filename.lower().endswith(('.wav', '.mp3'))
         print("Audio file detected. Starting transcription...")
-
-
-
-
-
-
-
-
-
-
-
+
+        base_name = os.path.splitext(filename)[0]
+        temp_dir = "output/temp_normalized"
+        os.makedirs(temp_dir, exist_ok=True)
+
+        # === STEREO PROCESSING LOGIC ===
+        if enable_stereo_processing:
+            if audio_data.ndim != 2 or audio_data.shape[0] != 2:
+                print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
+                enable_stereo_processing = False # Disable stereo processing if audio is not stereo
+
+        if enable_stereo_processing:
+            print("Stereo processing enabled. Splitting channels...")
+            try:
+                left_channel = audio_data[0]
+                right_channel = audio_data[1]
+
+                normalized_left = normalize_loudness(left_channel, native_sample_rate)
+                normalized_right = normalize_loudness(right_channel, native_sample_rate)
+
+                temp_left_wav_path = os.path.join(temp_dir, f"{base_name}_left.wav")
+                temp_right_wav_path = os.path.join(temp_dir, f"{base_name}_right.wav")
+
+                sf.write(temp_left_wav_path, normalized_left, native_sample_rate)
+                sf.write(temp_right_wav_path, normalized_right, native_sample_rate)
+
+                print(f"Saved left channel to: {temp_left_wav_path}")
+                print(f"Saved right channel to: {temp_right_wav_path}")
+
+                print("Transcribing left channel...")
+                if transcription_method == "General Purpose":
+                    midi_path_left = TranscribeGeneralAudio(temp_left_wav_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+                else:
+                    midi_path_left = TranscribePianoAudio(temp_left_wav_path)
+
+                print("Transcribing right channel...")
+                if transcription_method == "General Purpose":
+                    midi_path_right = TranscribeGeneralAudio(temp_right_wav_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+                else:
+                    midi_path_right = TranscribePianoAudio(temp_right_wav_path)
+
+                if midi_path_left and midi_path_right:
+                    merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
+                    midi_path_for_rendering = merge_midis(midi_path_left, midi_path_right, merged_midi_path)
+                elif midi_path_left:
+                    print("Warning: Right channel transcription failed. Using left channel only.")
+                    midi_path_for_rendering = midi_path_left
+                elif midi_path_right:
+                    print("Warning: Left channel transcription failed. Using right channel only.")
+                    midi_path_for_rendering = midi_path_right
+                else:
+                    raise gr.Error("Both left and right channel transcriptions failed.")
+
+            except Exception as e:
+                print(f"An error occurred during stereo processing: {e}")
+                raise gr.Error(f"Stereo Processing Failed: {e}")
+        else:
+            print("Stereo processing disabled. Using standard mono transcription.")
+            if audio_data.ndim == 1:
+                mono_signal = audio_data
+            else:
+                mono_signal = np.mean(audio_data, axis=0)
+
+            normalized_mono = normalize_loudness(mono_signal, native_sample_rate)
+
+            temp_mono_wav_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
+            sf.write(temp_mono_wav_path, normalized_mono, native_sample_rate)
+
+            try:
+                if transcription_method == "General Purpose":
+                    midi_path_for_rendering = TranscribeGeneralAudio(
+                        temp_mono_wav_path, onset_thresh, frame_thresh, min_note_len,
+                        min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+                    )
+                else: # Piano-Specific
+                    midi_path_for_rendering = TranscribePianoAudio(temp_mono_wav_path)
+                analyze_midi_velocity(midi_path_for_rendering)
+            except Exception as e:
+                print(f"An error occurred during transcription: {e}")
+                raise gr.Error(f"Transcription Failed: {e}")
 
     # --- Step 2: Render the MIDI file with selected options ---
     print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}")
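
Condensed, the stereo branch above follows a split -> normalize -> write -> transcribe-per-channel -> merge pipeline. A schematic sketch (assumes the normalize_loudness and merge_midis helpers from this commit are in scope; transcribe stands in for either transcription backend):

import os
import soundfile as sf

def transcribe_stereo(audio_data, sr, base_name, temp_dir, transcribe):
    # audio_data: (2, N) float array; transcribe: wav path -> MIDI path or None
    midi_paths = {}
    for name, channel in (("left", audio_data[0]), ("right", audio_data[1])):
        wav_path = os.path.join(temp_dir, f"{base_name}_{name}.wav")
        sf.write(wav_path, normalize_loudness(channel, sr), sr)
        midi_paths[name] = transcribe(wav_path)

    if midi_paths["left"] and midi_paths["right"]:
        merged = os.path.join(temp_dir, f"{base_name}_merged.mid")
        return merge_midis(midi_paths["left"], midi_paths["right"], merged)
    # Fall back to whichever channel transcribed successfully
    return midi_paths["left"] or midi_paths["right"]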
@@ -696,7 +942,7 @@ def update_ui_visibility(transcription_method, soundfont_choice):
     """
    is_general = (transcription_method == "General Purpose")
    is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)
-
+
    return {
        general_transcription_settings: gr.update(visible=is_general),
        synth_8bit_settings: gr.update(visible=is_8bit),
@@ -751,8 +997,14 @@ if __name__ == "__main__":
                     value="General Purpose",
                     info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings."
                 )
+
+                # --- Stereo Processing Checkbox ---
+                enable_stereo_processing = gr.Checkbox(
+                    label="Enable Stereo Transcription",
+                    value=False,
+                    info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
+                )
 
-                # --- General Purpose (basic-pitch) Settings ---
                 with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
                     onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
                     frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
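
For context on how the checkbox value reaches process_and_render_file: Gradio passes each component in the inputs list as a positional argument, in order. A stripped-down wiring sketch (illustrative only; the real app passes many more components):

import gradio as gr

with gr.Blocks() as demo:
    input_file = gr.Audio(type="filepath", label="Upload Audio or MIDI")
    enable_stereo_processing = gr.Checkbox(label="Enable Stereo Transcription", value=False)
    out = gr.Textbox()
    gr.Button("Render").click(
        lambda path, stereo: f"{path} (stereo={stereo})",
        inputs=[input_file, enable_stereo_processing],
        outputs=out,
    )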
@@ -775,7 +1027,7 @@ if __name__ == "__main__":
         # --- Dynamically create the list of choices ---
         soundfont_choices = [SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys())
         # Set a safe default value
-        default_sf_choice = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" if "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" in soundfonts_dict else soundfont_choices[0]
+        default_sf_choice = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" if "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" in soundfonts_dict else (soundfont_choices[0] if soundfont_choices else "")
 
         soundfont_bank = gr.Dropdown(
             soundfont_choices,
@@ -831,6 +1083,7 @@ if __name__ == "__main__":
         # --- Define all input components for the click event ---
         all_inputs = [
             input_file,
+            enable_stereo_processing,
             transcription_method,
             onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, maximum_frequency,
             infer_onsets, melodia_trick, multiple_pitch_bends,
requirements.txt (CHANGED)
@@ -16,6 +16,8 @@ networkx
 scikit-learn
 psutil
 pretty_midi
+soundfile
+pyloudnorm
 piano_transcription_inference
 
 basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'