fffiloni committed on
Commit
396365e
·
verified ·
1 Parent(s): f2e99a3

gradio MCP mode ready

Browse files
Files changed (1) hide show
  1. gradio_app.py +64 -14
gradio_app.py CHANGED
@@ -72,6 +72,23 @@ def separate_speakers_core(audio_path):
72
 
73
  @spaces.GPU()
74
  def separate_dnr(audio_file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  audio, sr = torchaudio.load(audio_file)
76
  audio = audio.to(device)
77
 
@@ -96,6 +113,21 @@ def separate_dnr(audio_file):
96
 
97
  @spaces.GPU()
98
  def separate_speakers(audio_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  output_files = separate_speakers_core(audio_path)
100
  updates = []
101
  for i in range(MAX_SPEAKERS):
@@ -107,6 +139,22 @@ def separate_speakers(audio_path):
107
 
108
  @spaces.GPU()
109
  def separate_dnr_video(video_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  audio_path, video = extract_audio_from_video(video_path, 44100)
111
  dialog_path, effect_path, music_path = separate_dnr(audio_path)
112
 
@@ -120,19 +168,24 @@ def separate_dnr_video(video_path):
120
 
121
  return dialog_video, effect_video, music_video
122
 
123
- def convert_to_ffmpeg_friendly(input_wav, output_wav):
124
- subprocess.run([
125
- "ffmpeg", "-y",
126
- "-i", input_wav,
127
- "-ar", str(TARGET_SR),
128
- "-ac", "1",
129
- "-sample_fmt", "s16",
130
- output_wav
131
- ], check=True)
132
-
133
 
134
  @spaces.GPU()
135
  def separate_speakers_video(video_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  audio_path, video = extract_audio_from_video(video_path, 16000)
137
  output_files = separate_speakers_core(audio_path)
138
 
@@ -155,9 +208,6 @@ def separate_speakers_video(video_path):
155
  return updates
156
 
157
 
158
-
159
-
160
-
161
  # --- Gradio UI ---
162
  with gr.Blocks() as demo:
163
  gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
@@ -209,4 +259,4 @@ with gr.Blocks() as demo:
209
  vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)
210
 
211
  if __name__ == "__main__":
212
- demo.launch(ssr_mode=False)
 
72
 
73
  @spaces.GPU()
74
  def separate_dnr(audio_file):
75
+ """
76
+ Perform Dialog, Effects, and Music (DnR) separation on an uploaded audio file.
77
+
78
+ Args:
79
+ audio_file (str): File path to the input WAV audio file.
80
+ This should be a mixed audio track containing dialog, background music, and sound effects.
81
+
82
+ Returns:
83
+ Tuple[str, str, str]: Paths to the separated audio files:
84
+ - Dialog-only audio (dialog.wav)
85
+ - Sound effects-only audio (effect.wav)
86
+ - Background music-only audio (music.wav)
87
+
88
+ This function uses a pretrained DnR model (TIGER-DnR) to isolate the components in the audio.
89
+ It is intended for tasks such as improving intelligibility or remixing.
90
+ """
91
+
92
  audio, sr = torchaudio.load(audio_file)
93
  audio = audio.to(device)
94
 
 
113
 
114
  @spaces.GPU()
115
  def separate_speakers(audio_path):
116
+ """
117
+ Perform speaker separation on a mixed audio file containing multiple speakers.
118
+
119
+ Args:
120
+ audio_path (str): File path to the audio WAV file containing overlapping speech from multiple people.
121
+
122
+ Returns:
123
+ List[gr.update]: A list of Gradio update objects, each containing:
124
+ - A separate audio file for each identified speaker (up to MAX_SPEAKERS)
125
+ - Visibility and label updates for the UI
126
+
127
+ This function internally calls a pretrained speech separation model (TIGER-speech)
128
+ and isolates individual speaker tracks from the input audio.
129
+ """
130
+
131
  output_files = separate_speakers_core(audio_path)
132
  updates = []
133
  for i in range(MAX_SPEAKERS):
 
139
 
140
  @spaces.GPU()
141
  def separate_dnr_video(video_path):
142
+ """
143
+ Separate dialog, effects, and music from the audio of an uploaded video file and reattach them to the original video.
144
+
145
+ Args:
146
+ video_path (str): File path to the input video file (e.g., MP4 or MOV).
147
+ The video should contain a composite audio track with dialog, effects, and music.
148
+
149
+ Returns:
150
+ Tuple[str, str, str]: Paths to the output videos with:
151
+ - Only dialog audio track (dialog_video.mp4)
152
+ - Only effects audio track (effect_video.mp4)
153
+ - Only music audio track (music_video.mp4)
154
+
155
+ The audio is extracted from the video, separated using the DnR model, and then reattached to the original video visuals.
156
+ """
157
+
158
  audio_path, video = extract_audio_from_video(video_path, 44100)
159
  dialog_path, effect_path, music_path = separate_dnr(audio_path)
160
 
 
168
 
169
  return dialog_video, effect_video, music_video
170
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  @spaces.GPU()
173
  def separate_speakers_video(video_path):
174
+ """
175
+ Separate individual speakers from the audio track of a video and reattach each speaker’s voice to a copy of the original video.
176
+
177
+ Args:
178
+ video_path (str): File path to a video file with overlapping speech from multiple speakers.
179
+
180
+ Returns:
181
+ List[gr.update]: A list of Gradio update objects each containing:
182
+ - A new video file where the audio consists of only one speaker's voice
183
+ - Visibility and label information for UI display
184
+
185
+ The function extracts audio from the video, separates individual speakers using a pretrained model,
186
+ and generates one video per speaker by replacing the audio in the original video.
187
+ """
188
+
189
  audio_path, video = extract_audio_from_video(video_path, 16000)
190
  output_files = separate_speakers_core(audio_path)
191
 
 
208
  return updates
209
 
210
 
 
 
 
211
  # --- Gradio UI ---
212
  with gr.Blocks() as demo:
213
  gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
 
259
  vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)
260
 
261
  if __name__ == "__main__":
262
+ demo.launch(ssr_mode=False, mcp_server=True)