feat(separation): Implement advanced multi-stem separation and processing
This commit significantly enhances the audio separation capabilities by exposing the full four-stem output of the Demucs model (vocals, drums, bass, other) and giving users granular control over the transcription and audio-merging pipeline.
Users can now:
- Choose between a simple 'Accompaniment' mode and an advanced mode that exposes each instrumental stem individually (see the sketch after this list).
- Select multiple stems to be transcribed and automatically merged into a single MIDI file.
- Re-merge any of the original audio stems into the final rendered track.
- Work with a UI that dynamically adapts to the selected mode for a cleaner experience.
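For orientation, here is a minimal sketch of how the two modes map onto the new AppParameters fields introduced in the diff below. The field names are taken from this commit; the import path and constructing AppParameters from keyword arguments alone are assumptions (they rely on every other field having a default value).

# Sketch only: field names are from the AppParameters diff in this commit;
# the import path and reliance on defaults for every other field are assumptions.
from app import AppParameters

# Simple mode: drums + bass + other are treated as one 'Accompaniment' stem.
simple_mode = AppParameters(
    separate_vocals=True,                    # run Demucs
    enable_advanced_separation=False,
    transcribe_other_or_accompaniment=True,  # transcribe the accompaniment mix
    merge_vocals_to_render=True,             # lay the original vocals back over the render
)

# Advanced mode: pick individual stems to transcribe and to re-merge.
advanced_mode = AppParameters(
    separate_vocals=True,
    enable_advanced_separation=True,
    transcribe_bass=True,
    transcribe_other_or_accompaniment=True,  # acts as "Transcribe Other" in advanced mode
    merge_drums_to_render=True,              # keep the original drums in the final audio
)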
app.py (CHANGED)
Removed (the old single-target "vocals vs. accompaniment" path):

- AppParameters: the three parameter lines that sat between separate_vocals and enable_stereo_processing (presumably the old transcription_target / transcribe_both_stems fields, which the removed pipeline code still referenced); they are superseded by the new separation, transcription, and merge flags.
- run_single_file_pipeline: the other_part_tensor / other_part_sr placeholders that stored "the other part" when separation was performed.
- The old separation workflow: Demucs output was collapsed into just vocals and accompaniment (saved to temporary .flac files), a primary target was chosen from params.transcription_target ("Transcribe Vocals" or the accompaniment), the other stem was kept aside for re-merging, and either only the primary target or both stems were transcribed depending on params.transcribe_both_stems.
- The old re-merge: the rendered audio was padded against the single other_part_tensor, summed, peak-normalized, and written back into results_tuple[4].
- The old UI: the "separate vocals" checkbox group and the separate_vocals.change(...) listener that toggled its related controls directly.

Added (new code, by hunk):
@@ -104,9 +104,23 @@ class AppParameters:

    # Global Settings
    s8bit_preset_selector: str = "Custom"
    separate_vocals: bool = False

    # --- Advanced Separation and Merging Controls ---
    enable_advanced_separation: bool = False  # Controls visibility of advanced options
    separate_drums: bool = True
    separate_bass: bool = True
    separate_other: bool = True

    transcribe_vocals: bool = False
    transcribe_drums: bool = False
    transcribe_bass: bool = False
    transcribe_other_or_accompaniment: bool = True  # Default to transcribe 'other' as it's most common

    merge_vocals_to_render: bool = False
    merge_drums_to_render: bool = False
    merge_bass_to_render: bool = False
    merge_other_or_accompaniment: bool = False

    enable_stereo_processing: bool = False
    transcription_method: str = "General Purpose"
@@ -1333,10 +1347,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

    # --- Use the provided timestamp for unique filenames ---
    timestamped_base_name = f"{base_name}_{timestamp}"

    # --- Step 1: Check file type and transcribe if necessary ---
    if is_midi_input:
@@ -1385,25 +1396,19 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

        print(f"ERROR: Could not load {filename}. Skipping. FFmpeg error: {stderr}")
        return None  # Return None to indicate failure

    # --- Demucs Vocal Separation Logic ---
    # This block now handles multi-stem separation, transcription, and merging logic.
    separated_stems = {}  # This will store the audio tensors for merging

    if params.separate_vocals and demucs_model is not None:
        # --- Vocal Separation Workflow ---
        update_progress(0.2, "Separating audio with Demucs...")
        # Convert to the format Demucs expects (e.g., 44.1kHz, stereo)
        audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
        # Move tensor to GPU if available for faster processing
        if torch.cuda.is_available():
            audio_tensor = audio_tensor.cuda()

        print("Separating audio with Demucs... This may take some time.")
        # --- Wrap the model call in a no_grad() context ---
        with torch.no_grad():

@@ -1411,88 +1416,84 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

                demucs_model,
                audio_tensor[None],  # The input shape is [batch, channels, samples]
                device='cuda' if torch.cuda.is_available() else 'cpu',
                progress=True
            )[0]  # Remove the batch dimension from the output

        # --- Clear CUDA cache immediately after use ---
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print("CUDA cache cleared.")

        sources = {name: stem for name, stem in zip(demucs_model.sources, all_stems)}

        # --- Store original stems for potential re-merging ---
        for name, tensor in sources.items():
            separated_stems[name] = (tensor.cpu(), demucs_model.samplerate)

        # --- Prepare Stems for Transcription ---
        stems_to_transcribe = {}
        if params.enable_advanced_separation:
            # User is in advanced mode, handle each stem individually
            if params.transcribe_vocals:
                stems_to_transcribe['vocals'] = sources['vocals']
            if params.transcribe_drums:
                stems_to_transcribe['drums'] = sources['drums']
            if params.transcribe_bass:
                stems_to_transcribe['bass'] = sources['bass']
            if params.transcribe_other_or_accompaniment:
                stems_to_transcribe['other'] = sources['other']
        else:
            # User is in simple mode, create a single 'accompaniment' stem
            accompaniment_tensor = sources['drums'] + sources['bass'] + sources['other']
            if params.transcribe_vocals:
                stems_to_transcribe['vocals'] = sources['vocals']
            if params.transcribe_other_or_accompaniment:
                stems_to_transcribe['accompaniment'] = accompaniment_tensor

        # --- Transcribe Selected Stems ---
        transcribed_midi_paths = []
        if stems_to_transcribe:
            stem_count = len(stems_to_transcribe)
            for i, (name, tensor) in enumerate(stems_to_transcribe.items()):
                update_progress(0.3 + (0.3 * (i / stem_count)), f"Transcribing stem: {name}...")
                stem_path = os.path.join(temp_dir, f"{timestamped_base_name}_{name}.flac")
                torchaudio.save(stem_path, tensor.cpu(), demucs_model.samplerate)
                midi_path = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
                if midi_path:
                    transcribed_midi_paths.append((name, midi_path))

        # --- Merge Transcribed MIDIs ---
        if not transcribed_midi_paths:
            raise gr.Error("Separation was enabled, but no stems were selected for transcription, or transcription failed.")
        elif len(transcribed_midi_paths) == 1:
            midi_path_for_rendering = transcribed_midi_paths[0][1]
        else:
            update_progress(0.6, "Merging transcribed MIDIs...")
            merged_midi = pretty_midi.PrettyMIDI()
            for name, path in transcribed_midi_paths:
                try:
                    midi_stem = pretty_midi.PrettyMIDI(path)
                    for inst in midi_stem.instruments:
                        inst.name = f"{name.capitalize()} - {inst.name}"
                        merged_midi.instruments.append(inst)
                except Exception as e:
                    print(f"Warning: Could not merge MIDI for stem {name}. Error: {e}")
            final_merged_midi_path = os.path.join(temp_dir, f"{timestamped_base_name}_full_transcription.mid")
            merged_midi.write(final_merged_midi_path)
            midi_path_for_rendering = final_merged_midi_path

    else:  # Standard workflow without separation
        # --- Standard Workflow: Transcribe the original full audio ---
        audio_to_transcribe_path = os.path.join(temp_dir, f"{timestamped_base_name}_original.flac")
        torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)

        update_progress(0.2, "Transcribing audio to MIDI...")
        midi_path_for_rendering = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)

    if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering):
        print(f"ERROR: Transcription failed for {filename}. Skipping.")
        return None

    # --- Step 2: Render the FINAL MIDI file with selected options ---
    # The progress values are now conditional based on the input file type.
    update_progress(0.1 if is_midi_input else 0.6, "Applying MIDI transformations...")
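When more than one stem is transcribed, each instrument in the merged MIDI carries its stem name as a prefix, so the per-stem tracks remain identifiable afterwards. A small inspection sketch (the file path is hypothetical; only the naming pattern comes from the merge loop above):

import pretty_midi

pm = pretty_midi.PrettyMIDI("song_20250101_full_transcription.mid")  # hypothetical path
for inst in pm.instruments:
    # Names follow the "{Stem} - {original name}" pattern set in the merge loop above.
    print(inst.name, len(inst.notes))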
@@ -1515,60 +1516,70 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

    except Exception as e:
        print(f"Could not auto-recommend parameters for {filename}: {e}.")

    # --- Step 2: Render the FINAL MIDI file ---
    update_progress(0.2 if is_midi_input else 0.7, "Rendering MIDI to audio...")
    print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}")

    # Call the rendering function, Pass dictionaries directly to Render_MIDI
    results_tuple = Render_MIDI(input_midi_path=midi_path_for_rendering, params=params)

    # --- Final Audio Merging Logic ---
    stems_to_merge = []
    if params.separate_vocals:
        if params.merge_vocals_to_render and 'vocals' in separated_stems:
            stems_to_merge.append(separated_stems['vocals'])

        if params.enable_advanced_separation:
            if params.merge_drums_to_render and 'drums' in separated_stems:
                stems_to_merge.append(separated_stems['drums'])
            if params.merge_bass_to_render and 'bass' in separated_stems:
                stems_to_merge.append(separated_stems['bass'])
            if params.merge_other_or_accompaniment and 'other' in separated_stems:
                stems_to_merge.append(separated_stems['other'])
        else:  # Simple mode
            if params.merge_other_or_accompaniment:  # 'other' checkbox now controls the whole accompaniment
                accompaniment_tensor = separated_stems['drums'][0] + separated_stems['bass'][0] + separated_stems['other'][0]
                stems_to_merge.append((accompaniment_tensor, demucs_model.samplerate))

    if stems_to_merge:
        update_progress(0.9, "Re-merging audio stems...")
        rendered_srate, rendered_music_int16 = results_tuple[4]
        rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0
        final_mix_tensor = torch.from_numpy(rendered_music_float).T
        final_srate = rendered_srate

        for stem_tensor, stem_srate in stems_to_merge:
            # Resample if necessary
            if stem_srate != final_srate:
                # Resample all stems to match the rendered audio's sample rate
                resampler = torchaudio.transforms.Resample(stem_srate, final_srate)
                stem_tensor = resampler(stem_tensor)

            # Pad and add to the final mix
            len_mix = final_mix_tensor.shape[1]
            len_stem = stem_tensor.shape[1]
            if len_mix > len_stem:
                stem_tensor = torch.nn.functional.pad(stem_tensor, (0, len_mix - len_stem))
            elif len_stem > len_mix:
                final_mix_tensor = torch.nn.functional.pad(final_mix_tensor, (0, len_stem - len_mix))

            final_mix_tensor += stem_tensor

        # Normalize final mix to prevent clipping
        max_abs = torch.max(torch.abs(final_mix_tensor))
        if max_abs > 1.0: final_mix_tensor /= max_abs

        # Convert back to the required format (int16 numpy array)
        merged_audio_int16 = (final_mix_tensor.T.numpy() * 32767).astype(np.int16)

        # Update the results tuple with the newly merged audio
        temp_results_list = list(results_tuple)
        temp_results_list[4] = (final_srate, merged_audio_int16)
        results_tuple = tuple(temp_results_list)  # results_tuple is now updated
        print("Re-merging complete.")

    # --- Save final audio and return path ---
    update_progress(0.95, "Saving final files...")
    final_srate, final_audio_data = results_tuple[4]
    final_midi_path_from_render = results_tuple[3]  # Get the path of the processed MIDI
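The re-merge above pads whichever signal is shorter, sums the two, and normalizes only when the peak exceeds 1.0. A self-contained illustration of that arithmetic on synthetic tensors (not code from the commit):

import torch

# Synthetic stand-ins: 1.0 s of rendered audio and 1.5 s of an original stem (stereo, 44.1 kHz).
rendered = torch.rand(2, 44100) * 2 - 1
stem = torch.rand(2, 66150) * 2 - 1

# Pad the shorter signal to the longer one's length.
diff = stem.shape[1] - rendered.shape[1]
if diff > 0:
    rendered = torch.nn.functional.pad(rendered, (0, diff))
elif diff < 0:
    stem = torch.nn.functional.pad(stem, (0, -diff))

mix = rendered + stem
peak = mix.abs().max()
if peak > 1.0:  # normalize only when the sum would clip
    mix = mix / peak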
@@ -1577,7 +1588,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

    output_midi_dir = "output/final_midi"
    os.makedirs(output_audio_dir, exist_ok=True)
    os.makedirs(output_midi_dir, exist_ok=True)

    final_audio_path = os.path.join(output_audio_dir, f"{timestamped_base_name}_rendered.flac")
    # Also, copy the final processed MIDI to a consistent output directory with a timestamped name
    final_midi_path = os.path.join(output_midi_dir, f"{timestamped_base_name}_processed.mid")
@@ -2274,6 +2285,35 @@ if __name__ == "__main__":

            updates[component] = gr.update(value=value)

        return updates

    # --- UI Controller Function for Dynamic Visibility ---
    def update_separation_mode_ui(is_advanced):
        """
        Updates the visibility and labels of UI components based on whether
        the advanced separation mode is enabled.
        """
        if is_advanced:
            # Advanced Mode: Show individual controls, label becomes "Other"
            return {
                advanced_separation_controls: gr.update(visible=True),
                transcribe_drums: gr.update(visible=True),
                transcribe_bass: gr.update(visible=True),
                transcribe_other_or_accompaniment: gr.update(label="Transcribe Other"),
                merge_drums_to_render: gr.update(visible=True),
                merge_bass_to_render: gr.update(visible=True),
                merge_other_or_accompaniment: gr.update(label="Merge Other")
            }
        else:
            # Simple Mode: Hide individual controls, label becomes "Accompaniment"
            return {
                advanced_separation_controls: gr.update(visible=False),
                transcribe_drums: gr.update(visible=False),
                transcribe_bass: gr.update(visible=False),
                transcribe_other_or_accompaniment: gr.update(label="Transcribe Accompaniment"),
                merge_drums_to_render: gr.update(visible=False),
                merge_bass_to_render: gr.update(visible=False),
                merge_other_or_accompaniment: gr.update(label="Merge Accompaniment")
            }

    # --- Use the dataclass to define the master list of parameter keys ---
    # This is now the single source of truth for parameter order.
@@ -2363,16 +2403,41 @@ if __name__ == "__main__":

    enable_stereo_processing = gr.Checkbox(label="Enable Stereo Transcription", value=False,
        info="For stereo audio files only. When enabled, transcribes left and right channels independently, then merges them. Note: This will double the transcription time.")

    # --- Vocal Separation Group ---
    with gr.Group():
        separate_vocals = gr.Checkbox(label="Enable Source Separation (Demucs)", value=False,
            info="If checked, separates the audio into its component stems (vocals, drums, etc.) before processing.")

        # --- Container for all separation options, visible only when enabled ---
        with gr.Group(visible=False) as separation_options_box:
            gr.Markdown("#### 1. Stem Separation Options")
            enable_advanced_separation = gr.Checkbox(label="Enable Advanced Stem Control (for Accompaniment)", value=False,
                info="If checked, you can individually control drums, bass, and other. If unchecked, they are treated as a single 'Accompaniment' track.")

            with gr.Row(visible=False) as advanced_separation_controls:
                separate_drums = gr.Checkbox(label="Drums", value=True)
                separate_bass = gr.Checkbox(label="Bass", value=True)
                separate_other = gr.Checkbox(label="Other", value=True)

            gr.Markdown("#### 2. Transcription Targets")
            gr.Markdown("_Select which separated stem(s) to convert to MIDI._")
            with gr.Row():
                transcribe_vocals = gr.Checkbox(label="Transcribe Vocals", value=False)
                # These two will be hidden/shown dynamically
                transcribe_drums = gr.Checkbox(label="Transcribe Drums", value=False, visible=False)
                transcribe_bass = gr.Checkbox(label="Transcribe Bass", value=False, visible=False)
                # This checkbox will have its label changed dynamically
                transcribe_other_or_accompaniment = gr.Checkbox(label="Transcribe Accompaniment", value=True)

            gr.Markdown("#### 3. Audio Merging Targets")
            gr.Markdown("_Select which original stem(s) to re-merge with the final rendered audio._")
            with gr.Row():
                merge_vocals_to_render = gr.Checkbox(label="Merge Vocals", value=False)
                # These two will be hidden/shown dynamically
                merge_drums_to_render = gr.Checkbox(label="Merge Drums", value=False, visible=False)
                merge_bass_to_render = gr.Checkbox(label="Merge Bass", value=False, visible=False)
                # This checkbox will have its label changed dynamically
                merge_other_or_accompaniment = gr.Checkbox(label="Merge Accompaniment", value=True)

    with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
        # --- Preset dropdown for basic_pitch ---
@@ -2657,10 +2722,26 @@ if __name__ == "__main__":

    )

    # Event listeners for UI visibility and presets
    # When the main separation checkbox is toggled
    separate_vocals.change(
        fn=lambda x: gr.update(visible=x),
        inputs=separate_vocals,
        outputs=[separation_options_box]
    )

    # When the advanced stem control checkbox is toggled, update all relevant UI parts
    enable_advanced_separation.change(
        fn=update_separation_mode_ui,
        inputs=enable_advanced_separation,
        outputs=[
            advanced_separation_controls,
            transcribe_drums,
            transcribe_bass,
            transcribe_other_or_accompaniment,
            merge_drums_to_render,
            merge_bass_to_render,
            merge_other_or_accompaniment
        ]
    )

    # --- Listeners for dynamic UI updates ---
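update_separation_mode_ui returns a dict keyed by components, and Gradio routes each gr.update to its component as long as that component also appears in the listener's outputs list, which is why the outputs above enumerate all seven controls. A minimal standalone sketch of the same pattern (component names here are illustrative, not from app.py):

import gradio as gr

with gr.Blocks() as demo:
    advanced = gr.Checkbox(label="Advanced mode")
    drums = gr.Checkbox(label="Drums", visible=False)
    other = gr.Checkbox(label="Accompaniment")

    def toggle(is_advanced):
        # Returning a dict lets one callback update several components selectively.
        return {
            drums: gr.update(visible=is_advanced),
            other: gr.update(label="Other" if is_advanced else "Accompaniment"),
        }

    advanced.change(fn=toggle, inputs=advanced, outputs=[drums, other])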