Update app.py
app.py
CHANGED
@@ -21,8 +21,12 @@ import warnings
 from faster_whisper import WhisperModel
 from mutagen.mp3 import MP3
 from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
+import whisper
+from pyannote.audio import Pipeline as DiarizationPipeline
+from openvoice.api import TTS, ToneColorConverter
+from openvoice.se_extractor import get_se
 
 # Suppress warnings
 warnings.filterwarnings("ignore")
 
 # === Helper Functions ===
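Note that `pyannote/speaker-diarization` is a gated model, so the hard-coded `use_auth_token="YOUR_HF_TOKEN"` placeholder in the next hunk will fail at startup as committed. A minimal sketch of the usual Spaces pattern instead, assuming a secret named `HF_TOKEN` has been configured in the Space settings:

```python
import os

from pyannote.audio import Pipeline as DiarizationPipeline

# Read the gated-model token from the Space's secrets rather than
# committing it to app.py ("HF_TOKEN" is an assumed secret name).
diarize_model = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ["HF_TOKEN"],
)
```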
@@ -264,70 +268,57 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
 
-# ===
-whisper_model = WhisperModel("base")
-
-def transcribe_audio(audio_path):
-    segments, info = whisper_model.transcribe(audio_path, beam_size=5)
-    text = " ".join([seg.text for seg in segments])
-    return text
-
-# === TTS Tab ===
-from TTS.api import TTS
-
-tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+# === Load Models Once at Start ===
 
-def generate_tts(text):
-
-    tts.tts_to_file(text=text, file_path=out_path)
-    return out_path
-
-# === Analyze Audio Stats ===
-def analyze_audio(audio_path):
-    y, sr = torchaudio.load(audio_path)
-    rms = np.mean(librosa.feature.rms(y=y.numpy().flatten()))
-    tempo, _ = librosa.beat.beat_track(y=y.numpy().flatten(), sr=sr)
-    silence_ratio = np.mean(np.abs(y.numpy()) < 0.01)
-
-    plt.figure(figsize=(10, 4))
-    plt.plot(y.numpy().flatten(), color="lightblue")
-    plt.title("Loudness Over Time")
-    plt.tight_layout()
-    buf = BytesIO()
-    plt.savefig(buf, format="png")
-    plt.close()
-    image = Image.open(buf)
-
-    stats = {
-        "rms_loudness": float(rms),
-        "silence_ratio": float(silence_ratio),
-        "tempo_bpm": int(tempo)
-    }
+# 🧠 Speaker Diarization Model
+diarize_model = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="YOUR_HF_TOKEN")
 
-
+# 🎤 OpenVoice TTS + Converter
+tts_model = TTS(lang='en')
+tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
 
-# ===
-def mix_tracks(track1, track2, volume_offset):
-    a1 = AudioSegment.from_file(track1)
-    a2 = AudioSegment.from_file(track2)
-    mixed = a1.overlay(a2 - volume_offset)
-    out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
-    mixed.export(out_path, format="wav")
-    return out_path
+# === Transcribe & Diarize Tab ===
+whisper_model = WhisperModel("base")
 
-
-
-
-
-
-
-
-
-
-
+def diarize_and_transcribe(audio_path):
+    # Run diarization (pyannote expects a plain WAV file)
+    audio = AudioSegment.from_file(audio_path)
+    temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
+    audio.export(temp_wav, format="wav")
+    diarization = diarize_model(temp_wav)
+
+    # Run transcription with the shared faster-whisper model; materialize
+    # the segment generator so it can be scanned once per speaker turn
+    asr_segments, _ = whisper_model.transcribe(temp_wav, beam_size=5)
+    asr_segments = list(asr_segments)
+
+    segments = []
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        text = " ".join(seg.text for seg in asr_segments if seg.start >= turn.start and seg.end <= turn.end)
+        segments.append({
+            "speaker": speaker,
+            "start": turn.start,
+            "end": turn.end,
+            "text": text
+        })
+
+    return segments
+
+# === Voice Cloning (Dubbing) ===
+def clone_voice(source_audio, target_audio, text):
+    source_se, _ = get_se(source_audio, tone_converter)
+    target_se, _ = get_se(target_audio, tone_converter)
+    out_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
+    tts_model.tts_to_file(text=text, file_path=out_path)
+    tone_converter.convert(
+        audio_src_path=out_path,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=out_path
+    )
     return out_path
 
-# UI
+# === UI ===
 effect_options = [
     "Noise Reduction",
     "Compress Dynamic Range",
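One caveat in `diarize_and_transcribe` above: the containment test `seg.start >= turn.start and seg.end <= turn.end` silently drops any transcription segment that straddles a speaker-turn boundary, which is common at interruptions. (Note also that this hunk deletes `transcribe_audio`, `generate_tts`, and `analyze_audio` while the UI tabs below still reference them; unless duplicate definitions survive elsewhere in app.py, those definitions need to stay.) A more forgiving sketch, using the same `asr_segments` and pyannote `turn` objects as above, assigns a segment to every turn it overlaps:

```python
def text_for_turn(asr_segments, turn):
    # Keep any ASR segment that overlaps the diarization turn at all,
    # rather than requiring it to fall entirely inside the turn.
    parts = []
    for seg in asr_segments:
        overlap = min(seg.end, turn.end) - max(seg.start, turn.start)
        if overlap > 0:
            parts.append(seg.text.strip())
    return " ".join(parts)
```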
@@ -407,17 +398,41 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             clear_btn=None
         )
 
-    # --- Transcribe & Edit
+    # --- Transcribe & Edit ---
     with gr.Tab("📝 Transcribe & Edit"):
         gr.Interface(
            fn=transcribe_audio,
            inputs=gr.Audio(label="Upload Audio", type="filepath"),
            outputs=gr.Textbox(label="Transcribed Text", lines=10),
            title="Transcribe & Edit Spoken Content",
-           description="Convert voice to text"
+           description="Convert voice to text and edit it before exporting again."
        )
 
-    # ---
+    # --- Speaker Diarization ---
+    with gr.Tab("🧏‍♂️ Who Spoke When?"):
+        gr.Interface(
+            fn=diarize_and_transcribe,
+            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+            outputs=gr.JSON(label="Diarized Transcript"),
+            title="Split By Speaker + Transcribe",
+            description="Detect speakers and transcribe their speech automatically."
+        )
+
+    # --- Voice Cloning (Dubbing) ---
+    with gr.Tab("🎭 Voice Cloning (Dubbing)"):
+        gr.Interface(
+            fn=clone_voice,
+            inputs=[
+                gr.File(label="Source Voice Clip"),
+                gr.File(label="Target Voice Clip"),
+                gr.Textbox(label="Text to Clone", lines=5)
+            ],
+            outputs=gr.Audio(label="Cloned Output", type="filepath"),
+            title="Replace One Voice With Another",
+            description="Clone voice from source to target speaker using AI."
+        )
+
+    # --- TTS Voice Generator ---
     with gr.Tab("💬 TTS Voice Generator"):
         gr.Interface(
             fn=generate_tts,
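The `TTS(lang='en')` and argument-less `ToneColorConverter()` constructors in the second hunk don't match the OpenVoice API I know, where both models are built from a config plus checkpoint and `se_extractor.get_se` takes the converter as its second argument. A hedged sketch along the lines of the upstream OpenVoice demo (all checkpoint paths are assumptions):

```python
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint paths assumed from the upstream demo layout.
tts_model = BaseSpeakerTTS("checkpoints/base_speakers/EN/config.json", device=device)
tts_model.load_ckpt("checkpoints/base_speakers/EN/checkpoint.pth")
tone_converter = ToneColorConverter("checkpoints/converter/config.json", device=device)
tone_converter.load_ckpt("checkpoints/converter/checkpoint.pth")

def clone_voice(source_audio, target_audio, text):
    # Tone-color embeddings for both clips; get_se needs the converter.
    source_se, _ = se_extractor.get_se(source_audio, tone_converter)
    target_se, _ = se_extractor.get_se(target_audio, tone_converter)
    out_path = "cloned_output.wav"
    # Speak the text with the base voice, then re-color it in place.
    tts_model.tts(text, out_path, speaker="default", language="English")
    tone_converter.convert(audio_src_path=out_path, src_se=source_se,
                           tgt_se=target_se, output_path=out_path)
    return out_path
```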
@@ -427,7 +442,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Type anything and turn it into natural-sounding speech."
         )
 
-    # --- Audio Analysis Dashboard
+    # --- Audio Analysis Dashboard ---
     with gr.Tab("📊 Audio Analysis"):
         gr.Interface(
             fn=analyze_audio,
@@ -440,32 +455,4 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Analyze audio loudness, tempo, and frequency content."
         )
 
-    # --- Mix Two Tracks ---
-    with gr.Tab("🎚 Mix Two Tracks"):
-        gr.Interface(
-            fn=mix_tracks,
-            inputs=[
-                gr.File(label="Main Track"),
-                gr.File(label="Background Track"),
-                gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
-            ],
-            outputs=gr.File(label="Mixed Output"),
-            title="Overlay Two Tracks",
-            description="Mix or subtract two audio files."
-        )
-
-    # --- Load/Save Project ---
-    with gr.Tab("📁 Save/Load Project"):
-        gr.Interface(
-            fn=save_project,
-            inputs=[
-                gr.File(label="Original Audio"),
-                gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
-                gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
-            ],
-            outputs=gr.File(label="Project File (.aiproj)"),
-            title="Save Everything Together",
-            description="Save your session, effects, and settings in one file to reuse later."
-        )
-
 demo.launch()
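A last operational note: all three model stacks (pyannote, faster-whisper, OpenVoice) are now loaded eagerly at import time, so a Space that has gone to sleep pays for every one of them on each cold start even if the visitor only opens one tab. A sketch of lazy, cached loading, with the same model names as above and `HF_TOKEN` as in the earlier sketch:

```python
import os
from functools import lru_cache

@lru_cache(maxsize=1)
def get_diarizer():
    # Heavy import and download happen on first use, not at startup.
    from pyannote.audio import Pipeline
    return Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=os.environ["HF_TOKEN"],
    )

@lru_cache(maxsize=1)
def get_whisper():
    from faster_whisper import WhisperModel
    return WhisperModel("base")
```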