Rafii committed on
Commit
9e2aed0
·
1 Parent(s): a3fb860
Files changed (2) hide show
  1. app.py +108 -0
  2. test_audios/.DS_Store +0 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import whisperx
3
+ import os
4
+ import tempfile
5
+ from dotenv import load_dotenv
6
+
7
# Load environment variables; `hf_token` is later handed to the diarization
# pipeline in process_audio(). It is None when the variable is not set.
load_dotenv()
hf_token = os.getenv("hf_token")

# Model config
device = "cpu"
batch_size = 16
compute_type = "int8"

# Load main model once at import time so every request reuses it.
model = whisperx.load_model("large-v3", device, compute_type=compute_type)

# UI texts rendered at the top of the page.
title = "🎙️ Multilingual Audio Processor"
description = "Upload an audio file and select whether to transcribe, align words, or identify speakers (Powered by WhisperX)."
21
+
22
def _optional_float(value):
    """Return ``float(value)``, or ``None`` when the value is missing."""
    return None if value is None else float(value)


def clean_alignment(result):
    """Reduce an alignment result to plain, JSON-friendly Python data.

    Args:
        result: Mapping with an optional ``"segments"`` list. Each segment
            must provide ``"text"``, ``"start"`` and ``"end"`` and may
            provide a ``"words"`` list of mappings with ``"word"`` and,
            when the word could be timed, ``"start"``, ``"end"`` and
            ``"score"``.

    Returns:
        ``{"segments": [...]}`` where every numeric field is a built-in
        ``float``. Word-level ``start``/``end``/``score`` are ``None`` for
        words that carry no timing information instead of raising
        ``KeyError``.
    """
    cleaned_segments = []
    for seg in result.get("segments", []):
        cleaned_words = [
            {
                "word": word["word"],
                # Words the aligner could not place have no timing keys.
                "start": _optional_float(word.get("start")),
                "end": _optional_float(word.get("end")),
                "score": _optional_float(word.get("score")),
            }
            for word in seg.get("words", [])
        ]
        cleaned_segments.append({
            "text": seg["text"],
            "start": float(seg["start"]),
            "end": float(seg["end"]),
            "words": cleaned_words,
        })
    return {"segments": cleaned_segments}
40
+
41
def process_audio(audio_path, transcribe=True, align=False, diarize=False):
    """Transcribe an audio file and optionally align words and label speakers.

    Args:
        audio_path: Path of the audio file to process. A falsy value
            (``None`` or ``""``) short-circuits with a placeholder result.
        transcribe: Ignored; transcription is always performed. Kept so the
            existing positional call from the UI keeps working.
        align: When true, run word-level alignment on the transcript.
        diarize: When true, run speaker diarization and attach speakers.

    Returns:
        A 3-tuple ``(transcript, alignment, diarization)``:
        the transcript text, the cleaned alignment dict (``{}`` when
        alignment was not requested), and either a list of
        ``{"start", "end", "speaker", "text"}`` dicts or
        ``{"message": "No diarization."}``.
    """
    # NOTE: a dict (not a bare string) because this value feeds a JSON
    # output component, which is expected to parse plain strings as JSON text.
    no_diarization = {"message": "No diarization."}

    # Nothing uploaded: avoid failing inside the audio loader.
    if not audio_path:
        return "No audio file provided.", {}, no_diarization

    audio = whisperx.load_audio(audio_path)

    # Step 1: Transcribe (always)
    result = model.transcribe(audio, batch_size=batch_size)
    transcript_output = " ".join(seg["text"] for seg in result["segments"])

    # Step 2: Align
    align_output = {}
    if align:
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"], device=device
        )
        result = whisperx.align(result["segments"], model_a, metadata, audio, device)
        align_output = clean_alignment(result)

    # Step 3: Diarization
    diarize_output = []
    if diarize:
        diarize_model = whisperx.diarize.DiarizationPipeline(
            use_auth_token=hf_token,
            device=device,
        )
        diarize_segments = diarize_model(audio)
        result = whisperx.assign_word_speakers(diarize_segments, result)
        diarize_output = [
            {
                "start": float(seg["start"]),
                "end": float(seg["end"]),
                # Segments without an assigned speaker fall back to a default label.
                "speaker": seg.get("speaker", "SPEAKER_00"),
                "text": seg["text"],
            }
            for seg in result["segments"]
        ]

    return transcript_output, align_output, diarize_output or no_diarization
78
+
79
# Build the UI: inputs and sample audios on the left, results on the right.
with gr.Blocks(title=title, theme=gr.themes.Default(), analytics_enabled=True) as demo:
    gr.Markdown(f"<h1 style='text-align: center;font-size: 40px;'>{title}</h1>")
    gr.Markdown(f"<p style='text-align: center; font-size: 16px;'>{description}</p>")
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            # Not a real checkbox: a static note. Its text is still passed as
            # the `transcribe` argument of process_audio via `inputs` below.
            transcribe_checkbox = gr.Markdown("✅ Transcription will always be performed.")
            align_checkbox = gr.Checkbox(label="Align")
            diarize_checkbox = gr.Checkbox(label="Diarize")
            gr.Markdown("### <span style='font-size: 18px;'>🎧 Try Sample Audio</span>")
            # Offer every .mp3/.wav file found in test_audios/ as an example.
            gr.Examples(
                examples=[
                    [f"test_audios/{audio_file}"]
                    for audio_file in os.listdir("test_audios")
                    if audio_file.endswith(('.mp3', '.wav'))
                ],
                inputs=[audio_input],
                label="",
            )
        with gr.Column(scale=2):
            transcript_output = gr.Textbox(label="📄 Transcript", lines=10, interactive=False)
            alignment_output = gr.JSON(label="🧭 Word Alignment")
            diarization_output = gr.JSON(label="🗣️ Speaker Diarization")
    with gr.Row():
        process_button = gr.Button("Process")

    # Inputs map positionally onto process_audio(audio_path, transcribe, align, diarize).
    process_button.click(
        fn=process_audio,
        inputs=[audio_input, transcribe_checkbox, align_checkbox, diarize_checkbox],
        outputs=[transcript_output, alignment_output, diarization_output],
    )

if __name__ == "__main__":
    demo.launch(share=True)
test_audios/.DS_Store ADDED
Binary file (6.15 kB). View file