Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -21,8 +21,12 @@ import warnings
|
|
| 21 |
from faster_whisper import WhisperModel
|
| 22 |
from mutagen.mp3 import MP3
|
| 23 |
from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
# Suppress warnings
|
| 26 |
warnings.filterwarnings("ignore")
|
| 27 |
|
| 28 |
# === Helper Functions ===
|
|
@@ -264,70 +268,57 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
|
|
| 264 |
except Exception as e:
|
| 265 |
return None, f"β Batch processing failed: {str(e)}"
|
| 266 |
|
| 267 |
-
# ===
|
| 268 |
-
whisper_model = WhisperModel("base")
|
| 269 |
-
|
| 270 |
-
def transcribe_audio(audio_path):
    """Return the transcript of ``audio_path`` as a single string.

    The module-level ``whisper_model`` is run with a beam size of 5 and the
    ``text`` of every returned segment is joined with single spaces.
    """
    # The second element of the result is not needed here.
    chunks, _ = whisper_model.transcribe(audio_path, beam_size=5)
    return " ".join(chunk.text for chunk in chunks)
|
| 274 |
-
|
| 275 |
-
# === TTS Tab ===
|
| 276 |
-
from TTS.api import TTS
|
| 277 |
-
|
| 278 |
-
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
|
| 279 |
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
tts.tts_to_file(text=text, file_path=out_path)
|
| 283 |
-
return out_path
|
| 284 |
-
|
| 285 |
-
# === Analyze Audio Stats ===
|
| 286 |
-
def analyze_audio(audio_path):
|
| 287 |
-
y, sr = torchaudio.load(audio_path)
|
| 288 |
-
rms = np.mean(librosa.feature.rms(y=y.numpy().flatten()))
|
| 289 |
-
tempo, _ = librosa.beat.beat_track(y=y.numpy().flatten(), sr=sr)
|
| 290 |
-
silence_ratio = np.mean(np.abs(y.numpy()) < 0.01)
|
| 291 |
-
|
| 292 |
-
plt.figure(figsize=(10, 4))
|
| 293 |
-
plt.plot(y.numpy().flatten(), color="lightblue")
|
| 294 |
-
plt.title("Loudness Over Time")
|
| 295 |
-
plt.tight_layout()
|
| 296 |
-
buf = BytesIO()
|
| 297 |
-
plt.savefig(buf, format="png")
|
| 298 |
-
plt.close()
|
| 299 |
-
image = Image.open(buf)
|
| 300 |
-
|
| 301 |
-
stats = {
|
| 302 |
-
"rms_loudness": float(rms),
|
| 303 |
-
"silence_ratio": float(silence_ratio),
|
| 304 |
-
"tempo_bpm": int(tempo)
|
| 305 |
-
}
|
| 306 |
|
| 307 |
-
|
|
|
|
|
|
|
| 308 |
|
| 309 |
-
# ===
|
| 310 |
-
|
| 311 |
-
a1 = AudioSegment.from_file(track1)
|
| 312 |
-
a2 = AudioSegment.from_file(track2)
|
| 313 |
-
mixed = a1.overlay(a2 - volume_offset)
|
| 314 |
-
out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
|
| 315 |
-
mixed.export(out_path, format="wav")
|
| 316 |
-
return out_path
|
| 317 |
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
return out_path
|
| 329 |
|
| 330 |
-
# UI
|
| 331 |
effect_options = [
|
| 332 |
"Noise Reduction",
|
| 333 |
"Compress Dynamic Range",
|
|
@@ -407,17 +398,41 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
|
|
| 407 |
clear_btn=None
|
| 408 |
)
|
| 409 |
|
| 410 |
-
# --- Transcribe & Edit
|
| 411 |
with gr.Tab("π Transcribe & Edit"):
|
| 412 |
gr.Interface(
|
| 413 |
fn=transcribe_audio,
|
| 414 |
inputs=gr.Audio(label="Upload Audio", type="filepath"),
|
| 415 |
outputs=gr.Textbox(label="Transcribed Text", lines=10),
|
| 416 |
title="Transcribe & Edit Spoken Content",
|
| 417 |
-
description="Convert voice to text
|
| 418 |
)
|
| 419 |
|
| 420 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
with gr.Tab("π¬ TTS Voice Generator"):
|
| 422 |
gr.Interface(
|
| 423 |
fn=generate_tts,
|
|
@@ -427,7 +442,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
|
|
| 427 |
description="Type anything and turn it into natural-sounding speech."
|
| 428 |
)
|
| 429 |
|
| 430 |
-
# --- Audio Analysis Dashboard
|
| 431 |
with gr.Tab("π Audio Analysis"):
|
| 432 |
gr.Interface(
|
| 433 |
fn=analyze_audio,
|
|
@@ -440,32 +455,4 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
|
|
| 440 |
description="Analyze audio loudness, tempo, and frequency content."
|
| 441 |
)
|
| 442 |
|
| 443 |
-
# --- Mix Two Tracks ---
|
| 444 |
-
with gr.Tab("π Mix Two Tracks"):
|
| 445 |
-
gr.Interface(
|
| 446 |
-
fn=mix_tracks,
|
| 447 |
-
inputs=[
|
| 448 |
-
gr.File(label="Main Track"),
|
| 449 |
-
gr.File(label="Background Track"),
|
| 450 |
-
gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
|
| 451 |
-
],
|
| 452 |
-
outputs=gr.File(label="Mixed Output"),
|
| 453 |
-
title="Overlay Two Tracks",
|
| 454 |
-
description="Mix or subtract two audio files."
|
| 455 |
-
)
|
| 456 |
-
|
| 457 |
-
# --- Load/Save Project ---
|
| 458 |
-
with gr.Tab("π Save/Load Project"):
|
| 459 |
-
gr.Interface(
|
| 460 |
-
fn=save_project,
|
| 461 |
-
inputs=[
|
| 462 |
-
gr.File(label="Original Audio"),
|
| 463 |
-
gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
|
| 464 |
-
gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
|
| 465 |
-
],
|
| 466 |
-
outputs=gr.File(label="Project File (.aiproj)"),
|
| 467 |
-
title="Save Everything Together",
|
| 468 |
-
description="Save your session, effects, and settings in one file to reuse later."
|
| 469 |
-
)
|
| 470 |
-
|
| 471 |
demo.launch()
|
|
|
|
| 21 |
from faster_whisper import WhisperModel
|
| 22 |
from mutagen.mp3 import MP3
|
| 23 |
from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
|
| 24 |
+
import whisper
|
| 25 |
+
from pyannote.audio import Pipeline as DiarizationPipeline
|
| 26 |
+
from openvoice.api import TTS, ToneColorConverter
|
| 27 |
+
from openvoice.se_extractor import get_se
|
| 28 |
|
| 29 |
+
# Suppress warnings
|
| 30 |
warnings.filterwarnings("ignore")
|
| 31 |
|
| 32 |
# === Helper Functions ===
|
|
|
|
| 268 |
except Exception as e:
|
| 269 |
return None, f"β Batch processing failed: {str(e)}"
|
| 270 |
|
| 271 |
+
# === Load Models Once at Start ===
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
+
# Speaker diarization model.
# The Hugging Face access token is read from the HF_TOKEN environment
# variable (e.g. a Space secret) rather than being hard-coded in the source;
# a literal placeholder string can never authenticate.
diarize_model = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
+
# OpenVoice TTS + Converter
|
| 277 |
+
tts_model = TTS(lang='en')
|
| 278 |
+
tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
|
| 279 |
|
| 280 |
+
# === Transcribe & Diarize Tab ===
|
| 281 |
+
whisper_model = WhisperModel("base")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
+
def diarize_and_transcribe(audio_path):
    """Split a recording into speaker turns and attach transcribed text.

    The input is converted to WAV, passed to the module-level
    ``diarize_model`` pipeline, and transcribed with the module-level
    faster-whisper ``whisper_model``.

    Args:
        audio_path: Path to an audio file readable by
            ``AudioSegment.from_file``.

    Returns:
        A list of dicts, one per speaker turn, with the keys ``speaker``,
        ``start``, ``end`` and ``text``. ``text`` holds the transcript
        segments that lie entirely inside the turn; it is empty when no
        segment does.
    """
    # Use a unique temp file so concurrent requests cannot overwrite each
    # other's audio, and remove it once both models are done with it.
    fd, temp_wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        AudioSegment.from_file(audio_path).export(temp_wav, format="wav")

        # Run diarization
        diarization = diarize_model(temp_wav)

        # Run transcription with the already-loaded faster-whisper model.
        # Its segments come back as a lazy iterable, so materialise them:
        # they are scanned once per speaker turn below.
        transcript, _info = whisper_model.transcribe(temp_wav)
        transcript = list(transcript)
    finally:
        os.remove(temp_wav)

    segments = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        # faster-whisper segments expose start/end/text as attributes.
        text = " ".join(
            seg.text.strip()
            for seg in transcript
            if seg.start >= turn.start and seg.end <= turn.end
        )
        segments.append({
            "speaker": speaker,
            "start": turn.start,
            "end": turn.end,
            "text": text,
        })

    return segments
|
| 304 |
+
|
| 305 |
+
# === Voice Cloning (Dubbing) ===
|
| 306 |
+
def clone_voice(source_audio, target_audio, text):
    """Synthesize ``text`` and convert its tone colour to the target voice.

    Args:
        source_audio: Path (or uploaded-file object exposing ``.name``) of the
            clip whose speaker embedding is used as the conversion source.
        target_audio: Path (or uploaded-file object exposing ``.name``) of the
            clip whose speaker embedding is used as the conversion target.
        text: The text to speak.

    Returns:
        Path to the converted WAV file.
    """
    # Upload components may hand over a temp-file wrapper instead of a plain
    # path; fall back to the value itself when there is no ``.name``.
    source_path = getattr(source_audio, "name", source_audio)
    target_path = getattr(target_audio, "name", target_audio)

    # get_se needs the converter model to compute the speaker embedding.
    source_se, _ = get_se(source_path, tone_converter)
    target_se, _ = get_se(target_path, tone_converter)

    # Keep the intermediate TTS audio and the final output in separate,
    # uniquely named files: the converter should not overwrite its own input,
    # and concurrent requests must not share a path.
    fd, base_path = tempfile.mkstemp(prefix="tts_base_", suffix=".wav")
    os.close(fd)
    fd, out_path = tempfile.mkstemp(prefix="cloned_output_", suffix=".wav")
    os.close(fd)

    try:
        tts_model.tts_to_file(text=text, file_path=base_path)
        tone_converter.convert(
            audio_src_path=base_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=out_path,
        )
    except Exception:
        # Do not leave an empty output file behind when synthesis fails.
        os.remove(out_path)
        raise
    finally:
        os.remove(base_path)

    return out_path
|
| 320 |
|
| 321 |
+
# === UI ===
|
| 322 |
effect_options = [
|
| 323 |
"Noise Reduction",
|
| 324 |
"Compress Dynamic Range",
|
|
|
|
| 398 |
clear_btn=None
|
| 399 |
)
|
| 400 |
|
| 401 |
+
# --- Transcribe & Edit ---
|
| 402 |
with gr.Tab("π Transcribe & Edit"):
|
| 403 |
gr.Interface(
|
| 404 |
fn=transcribe_audio,
|
| 405 |
inputs=gr.Audio(label="Upload Audio", type="filepath"),
|
| 406 |
outputs=gr.Textbox(label="Transcribed Text", lines=10),
|
| 407 |
title="Transcribe & Edit Spoken Content",
|
| 408 |
+
description="Convert voice to text and edit it before exporting again."
|
| 409 |
)
|
| 410 |
|
| 411 |
+
# --- Speaker Diarization ---
|
| 412 |
+
with gr.Tab("π§ββοΈ Who Spoke When?"):
|
| 413 |
+
gr.Interface(
|
| 414 |
+
fn=diarize_and_transcribe,
|
| 415 |
+
inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
|
| 416 |
+
outputs=gr.JSON(label="Diarized Transcript"),
|
| 417 |
+
title="Split By Speaker + Transcribe",
|
| 418 |
+
description="Detect speakers and transcribe their speech automatically."
|
| 419 |
+
)
|
| 420 |
+
|
| 421 |
+
# --- Voice Cloning (Dubbing) ---
|
| 422 |
+
with gr.Tab("π Voice Cloning (Dubbing)"):
|
| 423 |
+
gr.Interface(
|
| 424 |
+
fn=clone_voice,
|
| 425 |
+
inputs=[
|
| 426 |
+
gr.File(label="Source Voice Clip"),
|
| 427 |
+
gr.File(label="Target Voice Clip"),
|
| 428 |
+
gr.Textbox(label="Text to Clone", lines=5)
|
| 429 |
+
],
|
| 430 |
+
outputs=gr.Audio(label="Cloned Output", type="filepath"),
|
| 431 |
+
title="Replace One Voice With Another",
|
| 432 |
+
description="Clone voice from source to target speaker using AI"
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
+
# --- TTS Voice Generator ---
|
| 436 |
with gr.Tab("π¬ TTS Voice Generator"):
|
| 437 |
gr.Interface(
|
| 438 |
fn=generate_tts,
|
|
|
|
| 442 |
description="Type anything and turn it into natural-sounding speech."
|
| 443 |
)
|
| 444 |
|
| 445 |
+
# --- Audio Analysis Dashboard ---
|
| 446 |
with gr.Tab("π Audio Analysis"):
|
| 447 |
gr.Interface(
|
| 448 |
fn=analyze_audio,
|
|
|
|
| 455 |
description="Analyze audio loudness, tempo, and frequency content."
|
| 456 |
)
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
demo.launch()
|