feat(separation): Implement advanced multi-stem separation and processing
This commit significantly enhances the audio separation capabilities by exposing the full four-stem output of the Demucs model (vocals, drums, bass, other) and giving users granular control over the transcription and audio-merging pipeline.
Users can now:
- Choose between a simple 'Accompaniment' mode and an advanced mode that exposes each instrumental stem individually (see the sketch after this list).
- Select multiple stems to be transcribed and automatically merged into a single MIDI file.
- Re-merge any of the original audio stems into the final rendered track.
- Work with a UI that dynamically adapts to the selected mode for a cleaner experience.
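For orientation, here is a minimal sketch of how the two modes map onto the new AppParameters fields introduced in the diff below. The field names are taken from this commit; the import path and constructing AppParameters from keyword arguments alone are assumptions (they rely on every other field having a default value).

# Sketch only: field names are from the AppParameters diff in this commit;
# the import path and reliance on defaults for every other field are assumptions.
from app import AppParameters

# Simple mode: drums + bass + other are treated as one 'Accompaniment' stem.
simple_mode = AppParameters(
    separate_vocals=True,                    # run Demucs
    enable_advanced_separation=False,
    transcribe_other_or_accompaniment=True,  # transcribe the accompaniment mix
    merge_vocals_to_render=True,             # lay the original vocals back over the render
)

# Advanced mode: pick individual stems to transcribe and to re-merge.
advanced_mode = AppParameters(
    separate_vocals=True,
    enable_advanced_separation=True,
    transcribe_bass=True,
    transcribe_other_or_accompaniment=True,  # acts as "Transcribe Other" in advanced mode
    merge_drums_to_render=True,              # keep the original drums in the final audio
)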
app.py (CHANGED)
Removed (the old single-target "vocals vs. accompaniment" path):

- AppParameters: the three parameter lines that sat between separate_vocals and enable_stereo_processing (presumably the old transcription_target / transcribe_both_stems fields, which the removed pipeline code still referenced); they are superseded by the new separation, transcription, and merge flags.
- run_single_file_pipeline: the other_part_tensor / other_part_sr placeholders that stored "the other part" when separation was performed.
- The old separation workflow: Demucs output was collapsed into just vocals and accompaniment (saved to temporary .flac files), a primary target was chosen from params.transcription_target ("Transcribe Vocals" or the accompaniment), the other stem was kept aside for re-merging, and either only the primary target or both stems were transcribed depending on params.transcribe_both_stems.
- The old re-merge: the rendered audio was padded against the single other_part_tensor, summed, peak-normalized, and written back into results_tuple[4].
- The old UI: the "separate vocals" checkbox group and the separate_vocals.change(...) listener that toggled its related controls directly.

Added (new code, by hunk):
@@ -104,9 +104,23 @@ class AppParameters:

    # Global Settings
    s8bit_preset_selector: str = "Custom"
    separate_vocals: bool = False

    # --- Advanced Separation and Merging Controls ---
    enable_advanced_separation: bool = False  # Controls visibility of advanced options
    separate_drums: bool = True
    separate_bass: bool = True
    separate_other: bool = True

    transcribe_vocals: bool = False
    transcribe_drums: bool = False
    transcribe_bass: bool = False
    transcribe_other_or_accompaniment: bool = True  # Default to transcribe 'other' as it's most common

    merge_vocals_to_render: bool = False
    merge_drums_to_render: bool = False
    merge_bass_to_render: bool = False
    merge_other_or_accompaniment: bool = False

    enable_stereo_processing: bool = False
    transcription_method: str = "General Purpose"
@@ -1333,10 +1347,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

    # --- Use the provided timestamp for unique filenames ---
    timestamped_base_name = f"{base_name}_{timestamp}"

    # --- Step 1: Check file type and transcribe if necessary ---
    if is_midi_input:
@@ -1385,25 +1396,19 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

        print(f"ERROR: Could not load {filename}. Skipping. FFmpeg error: {stderr}")
        return None  # Return None to indicate failure

    # --- Demucs Vocal Separation Logic ---
    # This block now handles multi-stem separation, transcription, and merging logic.
    separated_stems = {}  # This will store the audio tensors for merging

    if params.separate_vocals and demucs_model is not None:
        # --- Vocal Separation Workflow ---
        update_progress(0.2, "Separating audio with Demucs...")
        # Convert to the format Demucs expects (e.g., 44.1kHz, stereo)
        audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
        # Move tensor to GPU if available for faster processing
        if torch.cuda.is_available():
            audio_tensor = audio_tensor.cuda()

        print("Separating audio with Demucs... This may take some time.")
        # --- Wrap the model call in a no_grad() context ---
        with torch.no_grad():

@@ -1411,88 +1416,84 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

                demucs_model,
                audio_tensor[None],  # The input shape is [batch, channels, samples]
                device='cuda' if torch.cuda.is_available() else 'cpu',
                progress=True
            )[0]  # Remove the batch dimension from the output

        # --- Clear CUDA cache immediately after use ---
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print("CUDA cache cleared.")

        sources = {name: stem for name, stem in zip(demucs_model.sources, all_stems)}

        # --- Store original stems for potential re-merging ---
        for name, tensor in sources.items():
            separated_stems[name] = (tensor.cpu(), demucs_model.samplerate)

        # --- Prepare Stems for Transcription ---
        stems_to_transcribe = {}
        if params.enable_advanced_separation:
            # User is in advanced mode, handle each stem individually
            if params.transcribe_vocals:
                stems_to_transcribe['vocals'] = sources['vocals']
            if params.transcribe_drums:
                stems_to_transcribe['drums'] = sources['drums']
            if params.transcribe_bass:
                stems_to_transcribe['bass'] = sources['bass']
            if params.transcribe_other_or_accompaniment:
                stems_to_transcribe['other'] = sources['other']
        else:
            # User is in simple mode, create a single 'accompaniment' stem
            accompaniment_tensor = sources['drums'] + sources['bass'] + sources['other']
            if params.transcribe_vocals:
                stems_to_transcribe['vocals'] = sources['vocals']
            if params.transcribe_other_or_accompaniment:
                stems_to_transcribe['accompaniment'] = accompaniment_tensor

        # --- Transcribe Selected Stems ---
        transcribed_midi_paths = []
        if stems_to_transcribe:
            stem_count = len(stems_to_transcribe)
            for i, (name, tensor) in enumerate(stems_to_transcribe.items()):
                update_progress(0.3 + (0.3 * (i / stem_count)), f"Transcribing stem: {name}...")
                stem_path = os.path.join(temp_dir, f"{timestamped_base_name}_{name}.flac")
                torchaudio.save(stem_path, tensor.cpu(), demucs_model.samplerate)
                midi_path = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
                if midi_path:
                    transcribed_midi_paths.append((name, midi_path))

        # --- Merge Transcribed MIDIs ---
        if not transcribed_midi_paths:
            raise gr.Error("Separation was enabled, but no stems were selected for transcription, or transcription failed.")
        elif len(transcribed_midi_paths) == 1:
            midi_path_for_rendering = transcribed_midi_paths[0][1]
        else:
            update_progress(0.6, "Merging transcribed MIDIs...")
            merged_midi = pretty_midi.PrettyMIDI()
            for name, path in transcribed_midi_paths:
                try:
                    midi_stem = pretty_midi.PrettyMIDI(path)
                    for inst in midi_stem.instruments:
                        inst.name = f"{name.capitalize()} - {inst.name}"
                        merged_midi.instruments.append(inst)
                except Exception as e:
                    print(f"Warning: Could not merge MIDI for stem {name}. Error: {e}")
            final_merged_midi_path = os.path.join(temp_dir, f"{timestamped_base_name}_full_transcription.mid")
            merged_midi.write(final_merged_midi_path)
            midi_path_for_rendering = final_merged_midi_path

    else:  # Standard workflow without separation
        # --- Standard Workflow: Transcribe the original full audio ---
        audio_to_transcribe_path = os.path.join(temp_dir, f"{timestamped_base_name}_original.flac")
        torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)

        update_progress(0.2, "Transcribing audio to MIDI...")
        midi_path_for_rendering = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)

    if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering):
        print(f"ERROR: Transcription failed for {filename}. Skipping.")
        return None

    # --- Step 2: Render the FINAL MIDI file with selected options ---
    # The progress values are now conditional based on the input file type.
    update_progress(0.1 if is_midi_input else 0.6, "Applying MIDI transformations...")
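When more than one stem is transcribed, each instrument in the merged MIDI carries its stem name as a prefix, so the per-stem tracks remain identifiable afterwards. A small inspection sketch (the file path is hypothetical; only the naming pattern comes from the merge loop above):

import pretty_midi

pm = pretty_midi.PrettyMIDI("song_20250101_full_transcription.mid")  # hypothetical path
for inst in pm.instruments:
    # Names follow the "{Stem} - {original name}" pattern set in the merge loop above.
    print(inst.name, len(inst.notes))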
@@ -1515,60 +1516,70 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

    except Exception as e:
        print(f"Could not auto-recommend parameters for {filename}: {e}.")

    # --- Step 2: Render the FINAL MIDI file ---
    update_progress(0.2 if is_midi_input else 0.7, "Rendering MIDI to audio...")
    print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}")

    # Call the rendering function, Pass dictionaries directly to Render_MIDI
    results_tuple = Render_MIDI(input_midi_path=midi_path_for_rendering, params=params)

    # --- Final Audio Merging Logic ---
    stems_to_merge = []
    if params.separate_vocals:
        if params.merge_vocals_to_render and 'vocals' in separated_stems:
            stems_to_merge.append(separated_stems['vocals'])

        if params.enable_advanced_separation:
            if params.merge_drums_to_render and 'drums' in separated_stems:
                stems_to_merge.append(separated_stems['drums'])
            if params.merge_bass_to_render and 'bass' in separated_stems:
                stems_to_merge.append(separated_stems['bass'])
            if params.merge_other_or_accompaniment and 'other' in separated_stems:
                stems_to_merge.append(separated_stems['other'])
        else:  # Simple mode
            if params.merge_other_or_accompaniment:  # 'other' checkbox now controls the whole accompaniment
                accompaniment_tensor = separated_stems['drums'][0] + separated_stems['bass'][0] + separated_stems['other'][0]
                stems_to_merge.append((accompaniment_tensor, demucs_model.samplerate))

    if stems_to_merge:
        update_progress(0.9, "Re-merging audio stems...")
        rendered_srate, rendered_music_int16 = results_tuple[4]
        rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0
        final_mix_tensor = torch.from_numpy(rendered_music_float).T
        final_srate = rendered_srate

        for stem_tensor, stem_srate in stems_to_merge:
            # Resample if necessary
            if stem_srate != final_srate:
                # Resample all stems to match the rendered audio's sample rate
                resampler = torchaudio.transforms.Resample(stem_srate, final_srate)
                stem_tensor = resampler(stem_tensor)

            # Pad and add to the final mix
            len_mix = final_mix_tensor.shape[1]
            len_stem = stem_tensor.shape[1]
            if len_mix > len_stem:
                stem_tensor = torch.nn.functional.pad(stem_tensor, (0, len_mix - len_stem))
            elif len_stem > len_mix:
                final_mix_tensor = torch.nn.functional.pad(final_mix_tensor, (0, len_stem - len_mix))

            final_mix_tensor += stem_tensor

        # Normalize final mix to prevent clipping
        max_abs = torch.max(torch.abs(final_mix_tensor))
        if max_abs > 1.0: final_mix_tensor /= max_abs

        # Convert back to the required format (int16 numpy array)
        merged_audio_int16 = (final_mix_tensor.T.numpy() * 32767).astype(np.int16)

        # Update the results tuple with the newly merged audio
        temp_results_list = list(results_tuple)
        temp_results_list[4] = (final_srate, merged_audio_int16)
        results_tuple = tuple(temp_results_list)  # results_tuple is now updated
        print("Re-merging complete.")

    # --- Save final audio and return path ---
    update_progress(0.95, "Saving final files...")
    final_srate, final_audio_data = results_tuple[4]
    final_midi_path_from_render = results_tuple[3]  # Get the path of the processed MIDI
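The re-merge above pads whichever signal is shorter, sums the two, and normalizes only when the peak exceeds 1.0. A self-contained illustration of that arithmetic on synthetic tensors (not code from the commit):

import torch

# Synthetic stand-ins: 1.0 s of rendered audio and 1.5 s of an original stem (stereo, 44.1 kHz).
rendered = torch.rand(2, 44100) * 2 - 1
stem = torch.rand(2, 66150) * 2 - 1

# Pad the shorter signal to the longer one's length.
diff = stem.shape[1] - rendered.shape[1]
if diff > 0:
    rendered = torch.nn.functional.pad(rendered, (0, diff))
elif diff < 0:
    stem = torch.nn.functional.pad(stem, (0, -diff))

mix = rendered + stem
peak = mix.abs().max()
if peak > 1.0:  # normalize only when the sum would clip
    mix = mix / peak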
@@ -1577,7 +1588,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters

    output_midi_dir = "output/final_midi"
    os.makedirs(output_audio_dir, exist_ok=True)
    os.makedirs(output_midi_dir, exist_ok=True)

    final_audio_path = os.path.join(output_audio_dir, f"{timestamped_base_name}_rendered.flac")
    # Also, copy the final processed MIDI to a consistent output directory with a timestamped name
    final_midi_path = os.path.join(output_midi_dir, f"{timestamped_base_name}_processed.mid")
@@ -2274,6 +2285,35 @@ if __name__ == "__main__":

            updates[component] = gr.update(value=value)

        return updates

    # --- UI Controller Function for Dynamic Visibility ---
    def update_separation_mode_ui(is_advanced):
        """
        Updates the visibility and labels of UI components based on whether
        the advanced separation mode is enabled.
        """
        if is_advanced:
            # Advanced Mode: Show individual controls, label becomes "Other"
            return {
                advanced_separation_controls: gr.update(visible=True),
                transcribe_drums: gr.update(visible=True),
                transcribe_bass: gr.update(visible=True),
                transcribe_other_or_accompaniment: gr.update(label="Transcribe Other"),
                merge_drums_to_render: gr.update(visible=True),
                merge_bass_to_render: gr.update(visible=True),
                merge_other_or_accompaniment: gr.update(label="Merge Other")
            }
        else:
            # Simple Mode: Hide individual controls, label becomes "Accompaniment"
            return {
                advanced_separation_controls: gr.update(visible=False),
                transcribe_drums: gr.update(visible=False),
                transcribe_bass: gr.update(visible=False),
                transcribe_other_or_accompaniment: gr.update(label="Transcribe Accompaniment"),
                merge_drums_to_render: gr.update(visible=False),
                merge_bass_to_render: gr.update(visible=False),
                merge_other_or_accompaniment: gr.update(label="Merge Accompaniment")
            }

    # --- Use the dataclass to define the master list of parameter keys ---
    # This is now the single source of truth for parameter order.
@@ -2363,16 +2403,41 @@ if __name__ == "__main__":

    enable_stereo_processing = gr.Checkbox(label="Enable Stereo Transcription", value=False,
        info="For stereo audio files only. When enabled, transcribes left and right channels independently, then merges them. Note: This will double the transcription time.")

    # --- Vocal Separation Group ---
    with gr.Group():
        separate_vocals = gr.Checkbox(label="Enable Source Separation (Demucs)", value=False,
            info="If checked, separates the audio into its component stems (vocals, drums, etc.) before processing.")

        # --- Container for all separation options, visible only when enabled ---
        with gr.Group(visible=False) as separation_options_box:
            gr.Markdown("#### 1. Stem Separation Options")
            enable_advanced_separation = gr.Checkbox(label="Enable Advanced Stem Control (for Accompaniment)", value=False,
                info="If checked, you can individually control drums, bass, and other. If unchecked, they are treated as a single 'Accompaniment' track.")

            with gr.Row(visible=False) as advanced_separation_controls:
                separate_drums = gr.Checkbox(label="Drums", value=True)
                separate_bass = gr.Checkbox(label="Bass", value=True)
                separate_other = gr.Checkbox(label="Other", value=True)

            gr.Markdown("#### 2. Transcription Targets")
            gr.Markdown("_Select which separated stem(s) to convert to MIDI._")
            with gr.Row():
                transcribe_vocals = gr.Checkbox(label="Transcribe Vocals", value=False)
                # These two will be hidden/shown dynamically
                transcribe_drums = gr.Checkbox(label="Transcribe Drums", value=False, visible=False)
                transcribe_bass = gr.Checkbox(label="Transcribe Bass", value=False, visible=False)
                # This checkbox will have its label changed dynamically
                transcribe_other_or_accompaniment = gr.Checkbox(label="Transcribe Accompaniment", value=True)

            gr.Markdown("#### 3. Audio Merging Targets")
            gr.Markdown("_Select which original stem(s) to re-merge with the final rendered audio._")
            with gr.Row():
                merge_vocals_to_render = gr.Checkbox(label="Merge Vocals", value=False)
                # These two will be hidden/shown dynamically
                merge_drums_to_render = gr.Checkbox(label="Merge Drums", value=False, visible=False)
                merge_bass_to_render = gr.Checkbox(label="Merge Bass", value=False, visible=False)
                # This checkbox will have its label changed dynamically
                merge_other_or_accompaniment = gr.Checkbox(label="Merge Accompaniment", value=True)

    with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
        # --- Preset dropdown for basic_pitch ---
@@ -2657,10 +2722,26 @@ if __name__ == "__main__":

    )

    # Event listeners for UI visibility and presets
    # When the main separation checkbox is toggled
    separate_vocals.change(
        fn=lambda x: gr.update(visible=x),
        inputs=separate_vocals,
        outputs=[separation_options_box]
    )

    # When the advanced stem control checkbox is toggled, update all relevant UI parts
    enable_advanced_separation.change(
        fn=update_separation_mode_ui,
        inputs=enable_advanced_separation,
        outputs=[
            advanced_separation_controls,
            transcribe_drums,
            transcribe_bass,
            transcribe_other_or_accompaniment,
            merge_drums_to_render,
            merge_bass_to_render,
            merge_other_or_accompaniment
        ]
    )

    # --- Listeners for dynamic UI updates ---
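update_separation_mode_ui returns a dict keyed by components, and Gradio routes each gr.update to its component as long as that component also appears in the listener's outputs list, which is why the outputs above enumerate all seven controls. A minimal standalone sketch of the same pattern (component names here are illustrative, not from app.py):

import gradio as gr

with gr.Blocks() as demo:
    advanced = gr.Checkbox(label="Advanced mode")
    drums = gr.Checkbox(label="Drums", visible=False)
    other = gr.Checkbox(label="Accompaniment")

    def toggle(is_advanced):
        # Returning a dict lets one callback update several components selectively.
        return {
            drums: gr.update(visible=is_advanced),
            other: gr.update(label="Other" if is_advanced else "Accompaniment"),
        }

    advanced.change(fn=toggle, inputs=advanced, outputs=[drums, other])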