Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import ffmpy
|
2 |
import asyncio
|
3 |
import edge_tts
|
@@ -22,6 +23,7 @@ text_to_speech = 'text_to_speech.wav'
|
|
22 |
vocals = f'./{folder}/{file_name}/vocals.wav'
|
23 |
vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
|
24 |
accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
|
|
|
25 |
output_rate_audio = 'output_rate_audio.wav'
|
26 |
output_audio = 'output.wav'
|
27 |
output_video = 'output.mp4'
|
@@ -31,14 +33,26 @@ def gain_time(audio):
|
|
31 |
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
32 |
return float(result.stdout)
|
33 |
|
|
|
|
|
|
|
|
|
|
|
34 |
def time_verify():
|
35 |
audios = [vocals_monorail, text_to_speech]
|
|
|
36 |
time_lists = []
|
37 |
|
38 |
for audio in audios:
|
|
|
39 |
time_lists.append(gain_time(audio))
|
40 |
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
42 |
return r_time
|
43 |
|
44 |
def translator(text, TR_LANGUAGE, LANGUAGE):
|
@@ -77,12 +91,10 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
|
|
77 |
}
|
78 |
)
|
79 |
ff.run()
|
80 |
-
except ffmpy.FFRuntimeError:
|
81 |
-
raise gr.Error('Mismatched audio!')
|
82 |
-
|
83 |
-
subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
|
84 |
|
85 |
-
|
|
|
|
|
86 |
inputs={
|
87 |
vocals: None
|
88 |
},
|
@@ -90,14 +102,18 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
|
|
90 |
vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
|
91 |
}
|
92 |
)
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
vocals_monorail, # str (filepath or URL to file) in 'inputs' Audio component
|
98 |
'transcribe', # str in 'Task' Radio component
|
99 |
api_name='/predict'
|
100 |
-
|
|
|
|
|
|
|
|
|
101 |
|
102 |
ts_text = translator(result, TR_LANGUAGE, LANGUAGE)
|
103 |
|
@@ -106,18 +122,40 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
|
|
106 |
await communicate.save(text_to_speech)
|
107 |
asyncio.run(amain())
|
108 |
|
109 |
-
|
110 |
ff = ffmpy.FFmpeg(
|
111 |
-
inputs={
|
|
|
|
|
112 |
outputs={
|
113 |
-
output_rate_audio: ['-y', '-filter:a', f'atempo={
|
114 |
}
|
115 |
)
|
116 |
ff.run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
ff = ffmpy.FFmpeg(
|
119 |
inputs={
|
120 |
-
|
121 |
accompaniment: None
|
122 |
},
|
123 |
outputs={
|
@@ -132,7 +170,7 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
|
|
132 |
)
|
133 |
ff.run()
|
134 |
|
135 |
-
return output_video
|
136 |
|
137 |
with gr.Blocks() as demo:
|
138 |
TR_LANGUAGE = gr.Dropdown(translate, value=tr, label='Translator')
|
@@ -148,6 +186,12 @@ with gr.Blocks() as demo:
|
|
148 |
],
|
149 |
outputs=[
|
150 |
gr.Video(height=320, width=600, label='Output_video'),
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
],
|
152 |
title="Short-Video-To-Video",
|
153 |
description="🤗 [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3), Limited the video length to 60 seconds. Currently only supports google Translator, Use other [translators](https://github.com/DUQIA/Short-Video-To-Video/blob/main/README.md#use-other-translators). Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
|
|
|
1 |
+
import re
|
2 |
import ffmpy
|
3 |
import asyncio
|
4 |
import edge_tts
|
|
|
23 |
vocals = f'./{folder}/{file_name}/vocals.wav'
|
24 |
vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
|
25 |
accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
|
26 |
+
output_left_audio = 'output_left_audio.wav'
|
27 |
output_rate_audio = 'output_rate_audio.wav'
|
28 |
output_audio = 'output.wav'
|
29 |
output_video = 'output.mp4'
|
|
|
33 |
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
34 |
return float(result.stdout)
|
35 |
|
36 |
+
def left_justified(audio):
    """Return the first silence duration ffmpeg reports for ``audio``.

    Runs ffmpeg's ``silencedetect`` filter (threshold -38 dB, minimum
    length 0.01 s) over the file and extracts the first
    ``silence_duration`` value from its log output.

    Args:
        audio: Path of the audio file to analyse.

    Returns:
        The duration as a string (callers convert it with ``float``).
        ``'0'`` is returned when ffmpeg ran successfully but reported no
        silence at all.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    command = ['ffmpeg', '-i', audio, '-af', 'silencedetect=n=-38dB:d=0.01', '-f', 'null', '-']
    # ffmpeg writes filter diagnostics to stderr, so merge it into stdout.
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = result.stdout.decode(errors='replace')
    if result.returncode != 0:
        raise RuntimeError(f'ffmpeg silencedetect failed for {audio!r}: {output[-500:]}')

    # The dot must be escaped and the integer part may have several digits;
    # the previous pattern r'(\d.\d+)' turned e.g. "12.5" into "2.5".
    # The fractional part is optional in case ffmpeg prints a whole number.
    match = re.search(r'silence_duration: (\d+(?:\.\d+)?)', output)
    if match is None:
        # No silent span detected: treat as zero instead of crashing on None.
        return '0'
    # NOTE(review): this is the first silence ffmpeg reports, which is assumed
    # to be the leading silence -- confirm it always starts at t=0.
    return match.group(1)
|
40 |
+
|
41 |
def time_verify():
    """Compute the offset and tempo ratio used to align the synthesized speech.

    Measures ``vocals_monorail`` and ``text_to_speech`` with
    ``left_justified`` and ``gain_time`` (presumably the clip length in
    seconds -- confirm against ``gain_time``).

    Returns:
        A ``(j_time, r_time)`` tuple:

        * ``j_time`` -- ``left_justified`` value of the vocals minus that of
          the synthesized speech.
        * ``r_time`` -- ratio passed to ffmpeg's ``atempo`` filter by the
          caller.
    """
    audios = [vocals_monorail, text_to_speech]
    justified = []
    time_lists = []

    for audio in audios:
        # Convert once here so the arithmetic below works on floats only.
        justified.append(float(left_justified(audio)))
        time_lists.append(float(gain_time(audio)))

    j_time = justified[0] - justified[1]

    if time_lists[0] > time_lists[1]:
        # Vocals are longer: discount the offset from the longer clip.
        r_time = min(time_lists) / (max(time_lists) - j_time)
    else:
        r_time = max(time_lists) / min(time_lists)

    # The caller unpacks two values (``j_time, r_time = time_verify()``);
    # returning only ``r_time`` raised TypeError at that call site.
    return j_time, r_time
|
57 |
|
58 |
def translator(text, TR_LANGUAGE, LANGUAGE):
|
|
|
91 |
}
|
92 |
)
|
93 |
ff.run()
|
|
|
|
|
|
|
|
|
94 |
|
95 |
+
subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
|
96 |
+
|
97 |
+
ff = ffmpy.FFmpeg(
|
98 |
inputs={
|
99 |
vocals: None
|
100 |
},
|
|
|
102 |
vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
|
103 |
}
|
104 |
)
|
105 |
+
ff.run()
|
106 |
+
|
107 |
+
client = Client('https://hf-audio-whisper-large-v3.hf.space/')
|
108 |
+
result = client.predict(
|
109 |
vocals_monorail, # str (filepath or URL to file) in 'inputs' Audio component
|
110 |
'transcribe', # str in 'Task' Radio component
|
111 |
api_name='/predict'
|
112 |
+
)
|
113 |
+
except ffmpy.FFRuntimeError:
|
114 |
+
raise gr.Error('Mismatched audio!')
|
115 |
+
except client.RemoteDisconnected as e:
|
116 |
+
raise gr.Error(e)
|
117 |
|
118 |
ts_text = translator(result, TR_LANGUAGE, LANGUAGE)
|
119 |
|
|
|
122 |
await communicate.save(text_to_speech)
|
123 |
asyncio.run(amain())
|
124 |
|
125 |
+
j_time, r_time = time_verify()
|
126 |
ff = ffmpy.FFmpeg(
|
127 |
+
inputs={
|
128 |
+
text_to_speech: None
|
129 |
+
},
|
130 |
outputs={
|
131 |
+
output_rate_audio: ['-y', '-filter:a', f'atempo={r_time}']
|
132 |
}
|
133 |
)
|
134 |
ff.run()
|
135 |
+
if j_time > 0:
|
136 |
+
ff = ffmpy.FFmpeg(
|
137 |
+
inputs={
|
138 |
+
output_rate_audio: None
|
139 |
+
},
|
140 |
+
outputs={
|
141 |
+
output_left_audio: ['-y', '-af', f'areverse,apad=pad_dur={j_time}s,areverse']
|
142 |
+
}
|
143 |
+
)
|
144 |
+
ff.run()
|
145 |
+
else:
|
146 |
+
ff = ffmpy.FFmpeg(
|
147 |
+
inputs={
|
148 |
+
output_rate_audio: None
|
149 |
+
},
|
150 |
+
outputs={
|
151 |
+
output_left_audio: ['-y', '-filter:a', f'atrim=start={abs(j_time)}']
|
152 |
+
}
|
153 |
+
)
|
154 |
+
ff.run()
|
155 |
|
156 |
ff = ffmpy.FFmpeg(
|
157 |
inputs={
|
158 |
+
output_left_audio: None,
|
159 |
accompaniment: None
|
160 |
},
|
161 |
outputs={
|
|
|
170 |
)
|
171 |
ff.run()
|
172 |
|
173 |
+
return output_video, accompaniment, vocals_monorail, output_left_audio, text_to_speech, result, ts_text
|
174 |
|
175 |
with gr.Blocks() as demo:
|
176 |
TR_LANGUAGE = gr.Dropdown(translate, value=tr, label='Translator')
|
|
|
186 |
],
|
187 |
outputs=[
|
188 |
gr.Video(height=320, width=600, label='Output_video'),
|
189 |
+
gr.Audio(label='Accompaniment'),
|
190 |
+
gr.Audio(label='Vocals'),
|
191 |
+
gr.Audio(label='Vocals_justified'),
|
192 |
+
gr.Audio(label='Text_speech'),
|
193 |
+
gr.Text(label='Original'),
|
194 |
+
gr.Text(label='Translation'),
|
195 |
],
|
196 |
title="Short-Video-To-Video",
|
197 |
description="🤗 [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3), Limited the video length to 60 seconds. Currently only supports google Translator, Use other [translators](https://github.com/DUQIA/Short-Video-To-Video/blob/main/README.md#use-other-translators). Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
|