fffiloni committed
Commit 03e6ae6 · verified · 1 Parent(s): d9599f2

add video process tabs

Files changed (1):
  1. gradio_app.py +100 -56
gradio_app.py CHANGED
@@ -6,22 +6,32 @@ import torchaudio.transforms as T
 import soundfile as sf
 import gradio as gr
 import spaces
+from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
 import look2hear.models

-# Setup device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 # Load models
-dnr_model = look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR", cache_dir="cache")
-dnr_model.to(device).eval()
-
-sep_model = look2hear.models.TIGER.from_pretrained("JusperLee/TIGER-speech", cache_dir="cache")
-sep_model.to(device).eval()
+dnr_model = look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR", cache_dir="cache").to(device).eval()
+sep_model = look2hear.models.TIGER.from_pretrained("JusperLee/TIGER-speech", cache_dir="cache").to(device).eval()

 TARGET_SR = 16000
 MAX_SPEAKERS = 4

-# --- DnR Function ---
+def extract_audio_from_video(video_path):
+    video = VideoFileClip(video_path)
+    session_id = uuid.uuid4().hex[:8]
+    audio_path = f"temp_audio/{session_id}.wav"
+    os.makedirs("temp_audio", exist_ok=True)
+    video.audio.write_audiofile(audio_path, fps=44100, verbose=False, logger=None)
+    return audio_path, video
+
+def attach_audio_to_video(original_video, audio_path, out_path):
+    new_audio = AudioFileClip(audio_path)
+    new_video = original_video.set_audio(new_audio)
+    new_video.write_videofile(out_path, audio_codec='aac', verbose=False, logger=None)
+    return out_path
+
 @spaces.GPU()
 def separate_dnr(audio_file):
     audio, sr = torchaudio.load(audio_file)
@@ -30,22 +40,22 @@ def separate_dnr(audio_file):
     with torch.no_grad():
         dialog, effect, music = dnr_model(audio[None])

-    # Unique output folder
     session_id = uuid.uuid4().hex[:8]
     output_dir = os.path.join("output_dnr", session_id)
     os.makedirs(output_dir, exist_ok=True)

-    dialog_path = os.path.join(output_dir, "dialog.wav")
-    effect_path = os.path.join(output_dir, "effect.wav")
-    music_path = os.path.join(output_dir, "music.wav")
+    paths = {
+        "dialog": os.path.join(output_dir, "dialog.wav"),
+        "effect": os.path.join(output_dir, "effect.wav"),
+        "music": os.path.join(output_dir, "music.wav"),
+    }

-    torchaudio.save(dialog_path, dialog.cpu(), sr)
-    torchaudio.save(effect_path, effect.cpu(), sr)
-    torchaudio.save(music_path, music.cpu(), sr)
+    torchaudio.save(paths["dialog"], dialog.cpu(), sr)
+    torchaudio.save(paths["effect"], effect.cpu(), sr)
+    torchaudio.save(paths["music"], music.cpu(), sr)

-    return dialog_path, effect_path, music_path
+    return paths["dialog"], paths["effect"], paths["music"]

-# --- Speaker Separation Function ---
 @spaces.GPU()
 def separate_speakers(audio_path):
     waveform, original_sr = torchaudio.load(audio_path)
@@ -57,11 +67,8 @@ def separate_speakers(audio_path):
     audio_input = waveform.unsqueeze(0).to(device)

     with torch.no_grad():
-        ests_speech = sep_model(audio_input)
-
-    ests_speech = ests_speech.squeeze(0)
+        ests_speech = sep_model(audio_input).squeeze(0)

-    # Unique output folder
     session_id = uuid.uuid4().hex[:8]
     output_dir = os.path.join("output_sep", session_id)
     os.makedirs(output_dir, exist_ok=True)
@@ -69,8 +76,7 @@ def separate_speakers(audio_path):
     output_files = []
     for i in range(ests_speech.shape[0]):
         path = os.path.join(output_dir, f"speaker_{i+1}.wav")
-        audio_np = ests_speech[i].cpu().numpy()
-        sf.write(path, audio_np.T, TARGET_SR)  # Transpose only if shape is [T, C], usually not needed
+        sf.write(path, ests_speech[i].cpu().numpy(), TARGET_SR)
         output_files.append(path)

     updates = []
@@ -81,7 +87,57 @@
         updates.append(gr.update(value=None, visible=False))
     return updates

-# --- Gradio App ---
+@spaces.GPU()
+def separate_dnr_video(video_path):
+    audio_path, video = extract_audio_from_video(video_path)
+    dialog_path, effect_path, music_path = separate_dnr(audio_path)
+
+    session_id = uuid.uuid4().hex[:8]
+    output_dir = os.path.join("output_dnr_video", session_id)
+    os.makedirs(output_dir, exist_ok=True)
+
+    dialog_video = attach_audio_to_video(video, dialog_path, os.path.join(output_dir, "dialog_video.mp4"))
+    effect_video = attach_audio_to_video(video, effect_path, os.path.join(output_dir, "effect_video.mp4"))
+    music_video = attach_audio_to_video(video, music_path, os.path.join(output_dir, "music_video.mp4"))
+
+    return dialog_video, effect_video, music_video
+
+@spaces.GPU()
+def separate_speakers_video(video_path):
+    audio_path, video = extract_audio_from_video(video_path)
+
+    waveform, original_sr = torchaudio.load(audio_path)
+    if original_sr != TARGET_SR:
+        waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)
+
+    if waveform.dim() == 1:
+        waveform = waveform.unsqueeze(0)
+    audio_input = waveform.unsqueeze(0).to(device)
+
+    with torch.no_grad():
+        ests_speech = sep_model(audio_input).squeeze(0)
+
+    session_id = uuid.uuid4().hex[:8]
+    output_dir = os.path.join("output_sep_video", session_id)
+    os.makedirs(output_dir, exist_ok=True)
+
+    output_videos = []
+    for i in range(ests_speech.shape[0]):
+        path = os.path.join(output_dir, f"speaker_{i+1}.wav")
+        sf.write(path, ests_speech[i].cpu().numpy(), TARGET_SR)
+        video_path = os.path.join(output_dir, f"speaker_{i+1}_video.mp4")
+        attach_audio_to_video(video, path, video_path)
+        output_videos.append(video_path)
+
+    updates = []
+    for i in range(MAX_SPEAKERS):
+        if i < len(output_videos):
+            updates.append(gr.update(value=output_videos[i], visible=True, label=f"Speaker {i+1}"))
+        else:
+            updates.append(gr.update(value=None, visible=False))
+    return updates
+
+# --- Gradio UI ---
 with gr.Blocks() as demo:
     gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
     gr.Markdown("TIGER is a lightweight model for speech separation which effectively extracts key acoustic features through frequency band-split, multi-scale and full-frequency-frame modeling.")
@@ -97,51 +153,39 @@ with gr.Blocks() as demo:
         </a>
     </div>
     """)
-    with gr.Tabs():
-        # --- Tab 1: DnR ---
-        with gr.Tab("Dialog/Effects/Music Separation (DnR)"):
-            gr.Markdown("### Separate Dialog, Effects, and Music from Mixed Audio")
-
-            dnr_input = gr.Audio(type="filepath", label="Upload Audio File")
-            dnr_button = gr.Button("Separate Audio")

+    with gr.Tabs():
+        with gr.Tab("Audio DnR"):
+            dnr_input = gr.Audio(type="filepath", label="Upload Audio")
+            dnr_btn = gr.Button("Separate")
             gr.Examples(
                 examples = ["./test/test_mixture_466.wav"],
                 inputs = dnr_input
             )
+            dnr_output = [gr.Audio(label=l) for l in ["Dialog", "Effects", "Music"]]
+            dnr_btn.click(separate_dnr, inputs=dnr_input, outputs=dnr_output)

-            dnr_output_dialog = gr.Audio(label="Dialog", type="filepath")
-            dnr_output_effect = gr.Audio(label="Effects", type="filepath")
-            dnr_output_music = gr.Audio(label="Music", type="filepath")
-
-            dnr_button.click(
-                fn=separate_dnr,
-                inputs=dnr_input,
-                outputs=[dnr_output_dialog, dnr_output_effect, dnr_output_music]
-            )
-
-        # --- Tab 2: Speaker Separation ---
-        with gr.Tab("Speaker Separation"):
-            gr.Markdown("### Separate Individual Speakers from Mixed Speech")
-
+        with gr.Tab("Audio Speaker Separation"):
             sep_input = gr.Audio(type="filepath", label="Upload Speech Audio")
-            sep_button = gr.Button("Separate Speakers")
-
+            sep_btn = gr.Button("Separate Speakers")
             gr.Examples(
                 examples = ["./test/mix.wav"],
                 inputs = sep_input
             )
+            sep_outputs = [gr.Audio(label=f"Speaker {i+1}", visible=(i==0)) for i in range(MAX_SPEAKERS)]
+            sep_btn.click(separate_speakers, inputs=sep_input, outputs=sep_outputs)

-            gr.Markdown("#### Separated Speakers")
-            sep_outputs = []
-            for i in range(MAX_SPEAKERS):
-                sep_outputs.append(gr.Audio(label=f"Speaker {i+1}", visible=(i == 0), interactive=False))
+        with gr.Tab("Video DnR"):
+            vdnr_input = gr.Video(label="Upload Video")
+            vdnr_btn = gr.Button("Separate Audio Tracks")
+            vdnr_output = [gr.Video(label=l) for l in ["Dialog Video", "Effects Video", "Music Video"]]
+            vdnr_btn.click(separate_dnr_video, inputs=vdnr_input, outputs=vdnr_output)

-            sep_button.click(
-                fn=separate_speakers,
-                inputs=sep_input,
-                outputs=sep_outputs
-            )
+        with gr.Tab("Video Speaker Separation"):
+            vsep_input = gr.Video(label="Upload Video")
+            vsep_btn = gr.Button("Separate Speakers")
+            vsep_outputs = [gr.Video(label=f"Speaker {i+1}", visible=(i==0)) for i in range(MAX_SPEAKERS)]
+            vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
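
For reference, a minimal sketch of the extract → separate → re-attach flow that the new video tabs wire up. This is not part of the commit: it assumes moviepy 1.x (the `moviepy.editor` API used above), that `gradio_app.py` is importable locally (importing it loads both TIGER checkpoints), and a placeholder input file `my_clip.mp4`.

```python
# Sketch only: drive the new video helpers outside of the Gradio UI.
# Assumes moviepy 1.x and a local clip at the placeholder path "my_clip.mp4".
from gradio_app import extract_audio_from_video, separate_dnr, attach_audio_to_video

# Demux the audio track to a temporary WAV (44.1 kHz, as in extract_audio_from_video).
audio_path, video = extract_audio_from_video("my_clip.mp4")

# Run TIGER-DnR on the extracted track; returns paths to dialog/effects/music WAVs.
dialog_wav, effects_wav, music_wav = separate_dnr(audio_path)

# Re-mux each separated stem onto the original frames as its own MP4,
# which is exactly what the "Video DnR" tab does per output.
for name, stem in [("dialog", dialog_wav), ("effects", effects_wav), ("music", music_wav)]:
    attach_audio_to_video(video, stem, f"{name}_only.mp4")
```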