Update app.py
app.py CHANGED

@@ -4,6 +4,7 @@ import numpy as np
 import tempfile
 import os
 import noisereduce as nr
+import json
 import torch
 from demucs import pretrained
 from demucs.apply import apply_model
@@ -12,13 +13,12 @@ from pathlib import Path
 import matplotlib.pyplot as plt
 from io import BytesIO
 from PIL import Image
-import
-from faster_whisper import WhisperModel
-import json
+import zipfile
 import datetime
 import librosa
 import joblib
 import warnings
+from faster_whisper import WhisperModel
 from mutagen.mp3 import MP3
 from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
 
@@ -121,11 +121,11 @@ def stem_split(audio_path):
     for i, name in enumerate(['drums', 'bass', 'other', 'vocals']):
         path = os.path.join(output_dir, f"{name}.wav")
         save_track(path, sources[i].cpu(), model.samplerate)
-        stem_paths.append(path)
+        stem_paths.append(gr.File(value=path))
 
-    return
+    return stem_paths
 
-# ===
+# === Preset Loader with Fallback ===
 def load_presets():
     try:
         preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
@@ -155,7 +155,7 @@ if not preset_choices:
 
 preset_names = list(preset_choices.keys())
 
-# === Waveform Generator ===
+# === Waveform + Spectrogram Generator ===
 def show_waveform(audio_file):
     try:
         audio = AudioSegment.from_file(audio_file)
@@ -171,18 +171,27 @@ def show_waveform(audio_file):
     except Exception as e:
         return None
 
+def detect_genre(audio_path):
+    try:
+        y, sr = torchaudio.load(audio_path)
+        mfccs = librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13).mean(axis=1).reshape(1, -1)
+        return "Speech"
+    except Exception:
+        return "Unknown"
+
 # === Session Info Export ===
-def generate_session_log(audio_path, effects, isolate_vocals, export_format):
+def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
     log = {
         "timestamp": str(datetime.datetime.now()),
         "filename": os.path.basename(audio_path),
         "effects_applied": effects,
         "isolate_vocals": isolate_vocals,
-        "export_format": export_format
+        "export_format": export_format,
+        "detected_genre": genre
     }
     return json.dumps(log, indent=2)
 
-# === Main Processing Function ===
+# === Main Processing Function with Status Updates ===
 def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
     status = "Loading audio..."
     try:
@@ -220,16 +229,42 @@ def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, exp
         final_audio.export(output_path, format=export_format.lower())
 
         waveform_image = show_waveform(output_path)
-
+        genre = detect_genre(output_path)
+        session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
 
         status = "Done!"
-        return output_path, waveform_image, session_log, status
+        return output_path, waveform_image, session_log, genre, status
 
     except Exception as e:
         status = f"❌ Error: {str(e)}"
-        return None, None, status, status
+        return None, None, status, "", status
+
+# === Batch Processing Function ===
+def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
+    status = "Loading files..."
+    try:
+        output_dir = tempfile.mkdtemp()
+        results = []
+        session_logs = []
+
+        for file in files:
+            processed_path, _, log, _, _ = process_audio(file.name, selected_effects, isolate_vocals, preset_name, export_format)
+            results.append(processed_path)
+            session_logs.append(log)
+
+        zip_path = os.path.join(output_dir, "batch_output.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            for i, res in enumerate(results):
+                filename = f"processed_{i}.{export_format.lower()}"
+                zipf.write(res, filename)
+                zipf.writestr(f"session_info_{i}.json", session_logs[i])
+
+        return zip_path, "📦 ZIP created successfully!"
+
+    except Exception as e:
+        return None, f"❌ Batch processing failed: {str(e)}"
 
-# ===
+# === Whisper Transcription Tab ===
 whisper_model = WhisperModel("base")
 
 def transcribe_audio(audio_path):
@@ -237,56 +272,126 @@ def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
 
+# === TTS Tab ===
+from TTS.api import TTS
+
+tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+
+def generate_tts(text):
+    out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
+    tts.tts_to_file(text=text, file_path=out_path)
+    return out_path
+
+# === Analyze Audio Stats ===
+def analyze_audio(audio_path):
+    y, sr = torchaudio.load(audio_path)
+    rms = np.mean(librosa.feature.rms(y=y.numpy().flatten()))
+    tempo, _ = librosa.beat.beat_track(y=y.numpy().flatten(), sr=sr)
+    silence_ratio = np.mean(np.abs(y.numpy()) < 0.01)
+
+    plt.figure(figsize=(10, 4))
+    plt.plot(y.numpy().flatten(), color="lightblue")
+    plt.title("Loudness Over Time")
+    plt.tight_layout()
+    buf = BytesIO()
+    plt.savefig(buf, format="png")
+    plt.close()
+    image = Image.open(buf)
+
+    stats = {
+        "rms_loudness": float(rms),
+        "silence_ratio": float(silence_ratio),
+        "tempo_bpm": int(tempo)
+    }
+
+    return stats, image
+
 # === Speaker Diarization Tab ===
 try:
     from pyannote.audio import Pipeline as DiarizationPipeline
-    from huggingface_hub import login
 
     hf_token = os.getenv("HF_TOKEN")
     if hf_token:
-
+        diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
     else:
-
-
-
-        "pyannote/speaker-diarization",
-        use_auth_token=hf_token or True
-    )
-except Exception as e:
-    print(f"⚠️ Failed to load diarization: {e}")
+        diarize_pipeline = None
+        print("⚠️ No HF_TOKEN set - speaker diarization disabled")
+except ImportError as e:
     diarize_pipeline = None
+    print(f"⚠️ Could not load diarization: {e}")
 
 def diarize_and_transcribe(audio_path):
-    if diarize_pipeline
-        return "⚠️ Diarization
+    if not diarize_pipeline:
+        return "⚠️ Diarization pipeline not loaded - check HF token or install pyannote.audio"
 
     # Run diarization
     audio = AudioSegment.from_file(audio_path)
     temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
     audio.export(temp_wav, format="wav")
 
+    # Run diarization
+    diarization = diarize_pipeline(temp_wav)
+
+    # Run transcription
+    result = whisper.transcribe(temp_wav)
+
+    segments = []
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
+        segments.append({
+            "speaker": speaker,
+            "start": turn.start,
+            "end": turn.end,
+            "text": text
+        })
+
+    return segments
+
+# === Save/Load Project File (.aiproj) ===
+def save_project(audio_path, preset_name, effects):
+    project_data = {
+        "audio": AudioSegment.from_file(audio_path).raw_data,
+        "preset": preset_name,
+        "effects": effects
+    }
+    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
+    with open(out_path, "wb") as f:
+        pickle.dump(project_data, f)
+    return out_path
+
+# === Mix Two Tracks ===
+def mix_tracks(track1, track2, volume_offset=0):
+    a1 = AudioSegment.from_file(track1)
+    a2 = AudioSegment.from_file(track2)
+    mixed = a1.overlay(a2 - volume_offset)
+    out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
+    mixed.export(out_path, format="wav")
+    return out_path
+
+# === Voice Style Transfer (Dummy) ===
+def apply_style_transfer(audio_path, mood="Happy"):
+    return audio_path
+
+# === Metadata Tagging ===
+def tag_mp3(file_path, title, artist, album, year):
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "text": text
-            })
-
-        return segments
+        audio = MP3(file_path)
+        try:
+            audio.tags = ID3()
+        except:
+            audio.add_tags()
+        audio.tags.add(TIT2(encoding=3, text=title))
+        audio.tags.add(TPE1(encoding=3, text=artist))
+        if album:
+            audio.tags.add(TALB(encoding=3, text=album))
+        if year:
+            audio.tags.add(TYER(encoding=3, text=str(year)))
+        audio.save()
+        return file_path
     except Exception as e:
-        return
+        return None
 
-# === UI
+# === UI ===
 effect_options = [
     "Noise Reduction",
     "Compress Dynamic Range",
@@ -317,6 +422,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
                 gr.Audio(label="Processed Audio", type="filepath"),
                 gr.Image(label="Waveform Preview"),
                 gr.Textbox(label="Session Log (JSON)", lines=5),
+                gr.Textbox(label="Detected Genre", lines=1),
                 gr.Textbox(label="Status", value="✅ Ready", lines=1)
             ],
             title="Edit One File at a Time",
@@ -326,17 +432,66 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             clear_btn=None
         )
 
-    # ---
+    # --- Batch Processing ---
+    with gr.Tab("Batch Processing"):
+        gr.Interface(
+            fn=batch_process_audio,
+            inputs=[
+                gr.File(label="Upload Multiple Files", file_count="multiple"),
+                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
+                gr.Checkbox(label="Isolate Vocals After Effects"),
+                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
+                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
+            ],
+            outputs=[
+                gr.File(label="Download ZIP of All Processed Files"),
+                gr.Textbox(label="Status", value="✅ Ready", lines=1)
+            ],
+            title="Batch Audio Processor",
+            description="Upload multiple files, apply effects in bulk, and download all results in a single ZIP.",
+            flagging_mode="never",
+            submit_btn="Process All Files",
+            clear_btn=None
+        )
+
+    # --- Remix Mode ---
+    with gr.Tab("Remix Mode"):
+        gr.Interface(
+            fn=stem_split,
+            inputs=gr.Audio(label="Upload Music Track", type="filepath"),
+            outputs=[
+                gr.File(label="Vocals"),
+                gr.File(label="Drums"),
+                gr.File(label="Bass"),
+                gr.File(label="Other")
+            ],
+            title="Split Into Drums, Bass, Vocals, and More",
+            description="Use AI to separate musical elements like vocals, drums, and bass.",
+            flagging_mode="never",
+            clear_btn=None
+        )
+
+    # --- Transcribe & Edit ---
     with gr.Tab("Transcribe & Edit"):
         gr.Interface(
             fn=transcribe_audio,
             inputs=gr.Audio(label="Upload Audio", type="filepath"),
             outputs=gr.Textbox(label="Transcribed Text", lines=10),
-            title="Transcribe Spoken Content",
-            description="Convert voice to text
+            title="Transcribe & Edit Spoken Content",
+            description="Convert voice to text, then edit the script before exporting again."
         )
 
-    # ---
+    # --- TTS Voice Generator ---
+    with gr.Tab("TTS Voice Generator"):
+        gr.Interface(
+            fn=generate_tts,
+            inputs=gr.Textbox(label="Enter Text", lines=5),
+            outputs=gr.Audio(label="Generated Speech", type="filepath"),
+            title="Text-to-Speech Generator",
+            description="Type anything and turn it into natural-sounding speech."
+        )
+
+    # --- Speaker Diarization ---
     if diarize_pipeline:
         with gr.Tab("Who Spoke When?"):
             gr.Interface(
@@ -344,54 +499,64 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
                 inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
                 outputs=gr.JSON(label="Diarized Transcript"),
                 title="Split By Speaker + Transcribe",
-                description="
-                flagging_mode="never"
+                description="Detect speakers and transcribe their speech automatically."
            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        zip_path = os.path.join(output_dir, "batch_output.zip")
-        with zipfile.ZipFile(zip_path, 'w') as zipf:
-            for i, res in enumerate(results):
-                filename = f"processed_{i}.{export_format.lower()}"
-                zipf.write(res, filename)
-                zipf.writestr(f"session_info_{i}.json", session_logs[i])
-
-        return zip_path, "📦 ZIP created successfully!"
+    # --- Load/Save Project ---
+    with gr.Tab("Save/Load Project"):
+        gr.Interface(
+            fn=save_project,
+            inputs=[
+                gr.File(label="Original Audio"),
+                gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
+                gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
+            ],
+            outputs=gr.File(label="Project File (.aiproj)"),
+            title="Save Everything Together",
+            description="Save your session, effects, and settings in one file to reuse later."
+        )
 
-
-
+    # --- Mix Two Tracks ---
+    with gr.Tab("Mix Two Tracks"):
+        gr.Interface(
+            fn=mix_tracks,
+            inputs=[
+                gr.File(label="Main Track"),
+                gr.File(label="Background Track"),
+                gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
+            ],
+            outputs=gr.File(label="Mixed Output"),
+            title="Overlay Two Tracks",
+            description="Mix or subtract two audio files."
+        )
 
-
+    # --- Voice Style Transfer ---
+    with gr.Tab("Voice Style Transfer"):
         gr.Interface(
-            fn=
+            fn=apply_style_transfer,
             inputs=[
-                gr.
-                gr.
-                gr.Checkbox(label="Isolate Vocals After Effects"),
-                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
-                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
+                gr.Audio(label="Upload Voice Clip", type="filepath"),
+                gr.Radio(["Happy", "Sad", "Angry", "Calm"], label="Choose Tone")
             ],
-            outputs=
-
-
+            outputs=gr.Audio(label="Stylized Output", type="filepath"),
+            title="Change Emotional Tone of Voice",
+            description="Shift the emotional style of any voice clip."
+        )
+
+    # --- Metadata Tagging ---
+    with gr.Tab("Add MP3 Tags"):
+        gr.Interface(
+            fn=tag_mp3,
+            inputs=[
+                gr.File(label="Upload MP3/WAV"),
+                gr.Textbox(label="Title"),
+                gr.Textbox(label="Artist"),
+                gr.Textbox(label="Album"),
+                gr.Number(label="Year")
            ],
-
-
-
-            submit_btn="Process All Files",
-            clear_btn=None
+            outputs=gr.File(label="Tagged Audio File"),
+            title="Add Title, Artist, Album, Year to MP3",
+            description="Enhance your exported files with metadata tags"
        )
 
 demo.launch()