Spaces: Running on Zero
gradio MCP mode ready
Browse files — gradio_app.py (+64 −14)
gradio_app.py
CHANGED
@@ -72,6 +72,23 @@ def separate_speakers_core(audio_path):
|
|
72 |
|
73 |
@spaces.GPU()
|
74 |
def separate_dnr(audio_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
audio, sr = torchaudio.load(audio_file)
|
76 |
audio = audio.to(device)
|
77 |
|
@@ -96,6 +113,21 @@ def separate_dnr(audio_file):
|
|
96 |
|
97 |
@spaces.GPU()
|
98 |
def separate_speakers(audio_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
output_files = separate_speakers_core(audio_path)
|
100 |
updates = []
|
101 |
for i in range(MAX_SPEAKERS):
|
@@ -107,6 +139,22 @@ def separate_speakers(audio_path):
|
|
107 |
|
108 |
@spaces.GPU()
|
109 |
def separate_dnr_video(video_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
audio_path, video = extract_audio_from_video(video_path, 44100)
|
111 |
dialog_path, effect_path, music_path = separate_dnr(audio_path)
|
112 |
|
@@ -120,19 +168,24 @@ def separate_dnr_video(video_path):
|
|
120 |
|
121 |
return dialog_video, effect_video, music_video
|
122 |
|
123 |
-
def convert_to_ffmpeg_friendly(input_wav, output_wav):
|
124 |
-
subprocess.run([
|
125 |
-
"ffmpeg", "-y",
|
126 |
-
"-i", input_wav,
|
127 |
-
"-ar", str(TARGET_SR),
|
128 |
-
"-ac", "1",
|
129 |
-
"-sample_fmt", "s16",
|
130 |
-
output_wav
|
131 |
-
], check=True)
|
132 |
-
|
133 |
|
134 |
@spaces.GPU()
|
135 |
def separate_speakers_video(video_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
audio_path, video = extract_audio_from_video(video_path, 16000)
|
137 |
output_files = separate_speakers_core(audio_path)
|
138 |
|
@@ -155,9 +208,6 @@ def separate_speakers_video(video_path):
|
|
155 |
return updates
|
156 |
|
157 |
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
# --- Gradio UI ---
|
162 |
with gr.Blocks() as demo:
|
163 |
gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
|
@@ -209,4 +259,4 @@ with gr.Blocks() as demo:
|
|
209 |
vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)
|
210 |
|
211 |
if __name__ == "__main__":
|
212 |
-
demo.launch(ssr_mode=False)
|
|
|
72 |
|
73 |
@spaces.GPU()
|
74 |
def separate_dnr(audio_file):
|
75 |
+
"""
|
76 |
+
Perform Dialog, Effects, and Music (DnR) separation on an uploaded audio file.
|
77 |
+
|
78 |
+
Args:
|
79 |
+
audio_file (str): File path to the input WAV audio file.
|
80 |
+
This should be a mixed audio track containing dialog, background music, and sound effects.
|
81 |
+
|
82 |
+
Returns:
|
83 |
+
Tuple[str, str, str]: Paths to the separated audio files:
|
84 |
+
- Dialog-only audio (dialog.wav)
|
85 |
+
- Sound effects-only audio (effect.wav)
|
86 |
+
- Background music-only audio (music.wav)
|
87 |
+
|
88 |
+
This function uses a pretrained DnR model (TIGER-DnR) to isolate the components in the audio.
|
89 |
+
It is intended for tasks such as improving intelligibility or remixing.
|
90 |
+
"""
|
91 |
+
|
92 |
audio, sr = torchaudio.load(audio_file)
|
93 |
audio = audio.to(device)
|
94 |
|
|
|
113 |
|
114 |
@spaces.GPU()
|
115 |
def separate_speakers(audio_path):
|
116 |
+
"""
|
117 |
+
Perform speaker separation on a mixed audio file containing multiple speakers.
|
118 |
+
|
119 |
+
Args:
|
120 |
+
audio_path (str): File path to the audio WAV file containing overlapping speech from multiple people.
|
121 |
+
|
122 |
+
Returns:
|
123 |
+
List[gr.update]: A list of Gradio update objects, each containing:
|
124 |
+
- A separate audio file for each identified speaker (up to MAX_SPEAKERS)
|
125 |
+
- Visibility and label updates for the UI
|
126 |
+
|
127 |
+
This function internally calls a pretrained speech separation model (TIGER-speech)
|
128 |
+
and isolates individual speaker tracks from the input audio.
|
129 |
+
"""
|
130 |
+
|
131 |
output_files = separate_speakers_core(audio_path)
|
132 |
updates = []
|
133 |
for i in range(MAX_SPEAKERS):
|
|
|
139 |
|
140 |
@spaces.GPU()
|
141 |
def separate_dnr_video(video_path):
|
142 |
+
"""
|
143 |
+
Separate dialog, effects, and music from the audio of an uploaded video file and reattach them to the original video.
|
144 |
+
|
145 |
+
Args:
|
146 |
+
video_path (str): File path to the input video file (e.g., MP4 or MOV).
|
147 |
+
The video should contain a composite audio track with dialog, effects, and music.
|
148 |
+
|
149 |
+
Returns:
|
150 |
+
Tuple[str, str, str]: Paths to the output videos with:
|
151 |
+
- Only dialog audio track (dialog_video.mp4)
|
152 |
+
- Only effects audio track (effect_video.mp4)
|
153 |
+
- Only music audio track (music_video.mp4)
|
154 |
+
|
155 |
+
The audio is extracted from the video, separated using the DnR model, and then reattached to the original video visuals.
|
156 |
+
"""
|
157 |
+
|
158 |
audio_path, video = extract_audio_from_video(video_path, 44100)
|
159 |
dialog_path, effect_path, music_path = separate_dnr(audio_path)
|
160 |
|
|
|
168 |
|
169 |
return dialog_video, effect_video, music_video
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
@spaces.GPU()
|
173 |
def separate_speakers_video(video_path):
|
174 |
+
"""
|
175 |
+
Separate individual speakers from the audio track of a video and reattach each speaker’s voice to a copy of the original video.
|
176 |
+
|
177 |
+
Args:
|
178 |
+
video_path (str): File path to a video file with overlapping speech from multiple speakers.
|
179 |
+
|
180 |
+
Returns:
|
181 |
+
List[gr.update]: A list of Gradio update objects each containing:
|
182 |
+
- A new video file where the audio consists of only one speaker's voice
|
183 |
+
- Visibility and label information for UI display
|
184 |
+
|
185 |
+
The function extracts audio from the video, separates individual speakers using a pretrained model,
|
186 |
+
and generates one video per speaker by replacing the audio in the original video.
|
187 |
+
"""
|
188 |
+
|
189 |
audio_path, video = extract_audio_from_video(video_path, 16000)
|
190 |
output_files = separate_speakers_core(audio_path)
|
191 |
|
|
|
208 |
return updates
|
209 |
|
210 |
|
|
|
|
|
|
|
211 |
# --- Gradio UI ---
|
212 |
with gr.Blocks() as demo:
|
213 |
gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
|
|
|
259 |
vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)
|
260 |
|
261 |
if __name__ == "__main__":
|
262 |
+
demo.launch(ssr_mode=False, mcp_server=True)
|