Update app.py
app.py
CHANGED
@@ -266,7 +266,7 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
 
-# ===
+# === Whisper Transcription Tab ===
 whisper_model = WhisperModel("base")
 
 def transcribe_audio(audio_path):
@@ -274,7 +274,7 @@ def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
 
-# === TTS
+# === TTS Voice Generator ===
 tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 
 def generate_tts(text):
@@ -282,6 +282,23 @@ def generate_tts(text):
     tts.tts_to_file(text=text, file_path=out_path)
     return out_path
 
+# === Save/Load Project File (.aiproj) ===
+def save_project(audio_path, preset_name, effects):
+    project_data = {
+        "audio": AudioSegment.from_file(audio_path).raw_data,
+        "preset": preset_name,
+        "effects": effects
+    }
+    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
+    with open(out_path, "wb") as f:
+        pickle.dump(project_data, f)
+    return out_path
+
+def load_project(project_file):
+    with open(project_file.name, "rb") as f:
+        data = pickle.load(f)
+    return data["preset"], data["effects"]
+
 # === Trim Silence Automatically (VAD) ===
 def detect_silence(audio_file, silence_threshold=-50.0, min_silence_len=1000):
     audio = AudioSegment.from_file(audio_file)
@@ -309,42 +326,53 @@ def mix_tracks(track1, track2, volume_offset=0):
     mixed.export(out_path, format="wav")
     return out_path
 
-# ===
-def save_project(audio_path, preset_name, effects):
-    project_data = {
-        "audio": AudioSegment.from_file(audio_path).raw_data,
-        "preset": preset_name,
-        "effects": effects
-    }
-    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
-    with open(out_path, "wb") as f:
-        pickle.dump(project_data, f)
-    return out_path
+# === Speaker Diarization ("Who Spoke When?") ===
+try:
+    from pyannote.audio import Pipeline as DiarizationPipeline
+    from huggingface_hub import login
 
-def load_project(project_file):
-    with open(project_file.name, "rb") as f:
-        data = pickle.load(f)
-    return data["preset"], data["effects"]
+    hf_token = os.getenv("HF_TOKEN")
+    if hf_token:
+        login(token=hf_token)
+    else:
+        print("⚠️ HF_TOKEN not set – speaker diarization disabled")
 
-
-def save_or_resume_session(audio, preset, effects, action="save"):
-    if action == "save":
-        return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
-    elif action == "load" and isinstance(audio, dict):
-        return (
-            None,
-            audio.get("audio"),
-            audio.get("preset"),
-            audio.get("effects")
-        )
-    return None, None, None, None
+    diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
+except ImportError:
+    diarize_pipeline = None
+    print("⚠️ PyAnnote not installed – speaker diarization disabled")
 
-
-
-
-    return generate_tts(text)
+def diarize_and_transcribe(audio_path):
+    if diarize_pipeline is None:
+        return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
 
-#
+    # Run diarization
+    audio = AudioSegment.from_file(audio_path)
+    temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
+    audio.export(temp_wav, format="wav")
+
+    try:
+        from pyannote.audio import Pipeline as DiarizationPipeline
+        diarization = diarize_pipeline(temp_wav)
+
+        # Run transcription
+        result = whisper.transcribe(temp_wav)
+
+        segments = []
+        for turn, _, speaker in diarization.itertracks(yield_label=True):
+            text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
+            segments.append({
+                "speaker": speaker,
+                "start": turn.start,
+                "end": turn.end,
+                "text": text
+            })
+
+        return segments
+    except Exception as e:
+        return f"⚠️ Diarization failed: {str(e)}"
+
+# === UI ===
 effect_options = [
     "Noise Reduction",
     "Compress Dynamic Range",
@@ -424,17 +452,17 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
         clear_btn=None
     )
 
-    # --- Transcribe & Edit
+    # --- Transcribe & Edit Tab ===
    with gr.Tab("📝 Transcribe & Edit"):
        gr.Interface(
            fn=transcribe_audio,
            inputs=gr.Audio(label="Upload Audio", type="filepath"),
            outputs=gr.Textbox(label="Transcribed Text", lines=10),
-            title="Transcribe Spoken Content",
+            title="Transcribe & Edit Spoken Content",
            description="Convert voice to text and edit it before exporting again."
        )
 
-    # --- TTS Voice Generator
+    # --- TTS Voice Generator ===
    with gr.Tab("💬 TTS Voice Generator"):
        gr.Interface(
            fn=generate_tts,
@@ -444,7 +472,52 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
            description="Type anything and turn it into natural-sounding speech."
        )
 
-    # ---
+    # --- Speaker Diarization (Who Spoke When?) ===
+    with gr.Tab("🧏‍♂️ Who Spoke When?"):
+        gr.Interface(
+            fn=diarize_and_transcribe,
+            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+            outputs=gr.JSON(label="Diarized Transcript"),
+            title="Split By Speaker + Transcribe",
+            description="Detect speakers and transcribe their speech automatically."
+        )
+
+    # --- Auto-Save / Resume Sessions ===
+    session_state = gr.State()
+
+    def save_or_resume_session(audio, preset, effects, action="save"):
+        if action == "save":
+            return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
+        elif action == "load" and isinstance(audio, dict):
+            return (
+                None,
+                audio.get("audio"),
+                audio.get("preset"),
+                audio.get("effects")
+            )
+        return None, None, None, None
+
+    with gr.Tab("🧾 Auto-Save & Resume"):
+        gr.Markdown("Save your current state and resume later.")
+
+        action_radio = gr.Radio(["save", "load"], label="Action", value="save")
+        audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
+        preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
+        effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
+        action_btn = gr.Button("Save or Load Session")
+
+        session_data = gr.State()
+        loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
+        loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
+        loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
+
+        action_btn.click(
+            fn=save_or_resume_session,
+            inputs=[audio_input, preset_dropdown, effect_checkbox, action_radio],
+            outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
+        )
+
+    # --- Trim Silence Automatically (VAD) ===
    with gr.Tab("✂️ Trim Silence Automatically"):
        gr.Interface(
            fn=detect_silence,
@@ -483,28 +556,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
            description="Load your saved session"
        )
 
-    # --- Auto-Save / Resume Sessions ===
-    session_state = gr.State()
-
-    with gr.Tab("🧾 Auto-Save & Resume"):
-        gr.Markdown("Save your current state and resume editing later.")
-
-        action_radio = gr.Radio(["save", "load"], label="Action", value="save")
-        audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
-        preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
-        effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
-        save_btn = gr.Button("Save or Load Session")
-
-        loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
-        loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
-        loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
-
-        save_btn.click(
-            fn=save_or_resume_session,
-            inputs=[audio_input, preset_dropdown, effect_checkbox, action_radio],
-            outputs=[session_state, loaded_audio, loaded_preset, loaded_effects]
-        )
-
    # --- Mix Two Tracks ===
    with gr.Tab("🎚 Mix Two Tracks"):
        gr.Interface(
@@ -516,37 +567,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
            ],
            outputs=gr.File(label="Mixed Output"),
            title="Overlay Two Tracks",
-            description="Mix
-    )
-
-    # === Voice Style Transfer (Dummy) ===
-    def apply_style_transfer(audio_path, mood="Happy"):
-        return audio_path
-
-    with gr.Tab("🎧 Voice Style Transfer"):
-        gr.Interface(
-            fn=apply_style_transfer,
-            inputs=[
-                gr.Audio(label="Upload Voice Clip", type="filepath"),
-                gr.Radio(["Happy", "Sad", "Angry", "Calm"], label="Choose Tone")
-            ],
-            outputs=gr.Audio(label="Stylized Output", type="filepath"),
-            title="Change Emotional Tone of Voice",
-            description="Shift the emotional style of any voice clip."
-        )
-
-    # --- Voice Cloning (Fallback) ===
-    with gr.Tab("🎭 Voice Cloning (Demo)"):
-        gr.Interface(
-            fn=clone_voice,
-            inputs=[
-                gr.File(label="Source Voice Clip"),
-                gr.File(label="Target Voice Clip"),
-                gr.Textbox(label="Text to Clone", lines=5)
-            ],
-            outputs=gr.Audio(label="Cloned Output", type="filepath"),
-            title="Replace One Voice With Another (Demo)",
-            description="Clone voice from source to target speaker using AI"
+            description="Mix or subtract two audio files."
        )
 
 demo.launch()
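Note on the new diarization code: diarize_and_transcribe calls whisper.transcribe(temp_wav), but the only speech model defined in the file is the faster-whisper WhisperModel bound to whisper_model (new line 270), and faster-whisper's transcribe returns a (segments, info) pair whose segments are objects with .start/.end/.text attributes, not an openai-whisper style dict. A minimal sketch of the transcription step rewritten under that assumption (fw_segments and _info are illustrative names):

    # Sketch: per-speaker text collection against faster-whisper's API.
    # Assumes whisper_model = WhisperModel("base") as declared earlier in
    # app.py, and diarization / temp_wav as in diarize_and_transcribe.
    fw_segments, _info = whisper_model.transcribe(temp_wav)
    fw_segments = list(fw_segments)  # materialize: the generator is single-use

    segments = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        text = " ".join(
            seg.text.strip() for seg in fw_segments
            if seg.start >= turn.start and seg.end <= turn.end
        )
        segments.append({"speaker": speaker, "start": turn.start, "end": turn.end, "text": text})

The strict containment test (seg.start >= turn.start and seg.end <= turn.end) mirrors the committed logic and silently drops segments that straddle a speaker change; an overlap test would be more forgiving.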
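For reference, a round-trip of the new .aiproj helpers might look like the sketch below. It assumes save_project and load_project are in scope (importing app.py directly would also execute demo.launch()), and "clip.wav" is a hypothetical local file; the _Upload class stands in for the Gradio upload object, of which load_project reads only the .name attribute:

    # Hypothetical round-trip for save_project/load_project as added above.
    class _Upload:
        def __init__(self, name):
            self.name = name  # load_project only dereferences .name

    proj_path = save_project("clip.wav", "Podcast", ["Noise Reduction"])
    preset, effects = load_project(_Upload(proj_path))
    print(preset, effects)  # -> Podcast ['Noise Reduction']

Two caveats: raw_data stores bare PCM bytes without sample rate, width, or channel count, so the audio itself cannot be rebuilt from the file (consistent with load_project returning only preset and effects), and unpickling an .aiproj from an untrusted source can execute arbitrary code, as with any pickle.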
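One more observation on the Auto-Save & Resume wiring: the "load" branch of save_or_resume_session only returns saved values when audio is a dict, but audio_input is a filepath gr.Audio component, so that branch is unreachable from this UI; the dict stored in session_data is never read back (and session_state is unused). A hedged sketch of one way to close the loop, using a separate hypothetical button inside the same gr.Blocks context:

    # Hypothetical rewiring inside the Auto-Save tab: read the saved dict
    # from the session_data state instead of expecting it in the audio input.
    def resume_session(saved):
        if isinstance(saved, dict):
            return saved.get("audio"), saved.get("preset"), saved.get("effects")
        return None, None, None

    load_btn = gr.Button("Load Session")  # hypothetical extra button
    load_btn.click(
        fn=resume_session,
        inputs=[session_data],
        outputs=[loaded_audio, loaded_preset, loaded_effects]
    )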