Create app.py
app.py
ADDED
import ffmpy
import asyncio
import edge_tts
import subprocess
import gradio as gr
import translators as ts
from gradio_client import Client
from list_dict import translates, speakers
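
# `list_dict.py` is not part of this commit; a minimal sketch of the shapes
# this script assumes (names and keys here are illustrative, not the real data):
#
#   translates = {..., 'google': {'English': 'en', 'Chinese': 'zh-CN', ...}}
#       # translator name -> {display language: language code}
#   speakers = ['en-US-AriaNeural', ...]
#       # edge-tts voice short names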

translate = translates
language = translate[list(translate.keys())[9]]  # language map of the 10th translator (presumably 'google', per the check in video_inputs)
speaker = speakers

# Intermediate and output file paths; spleeter writes its stems under
# ./output_directory/<input file stem>/.
file_name = 'audio'
main_video = 'video.mp4'
main_audio = f'{file_name}.wav'
folder = 'output_directory'
text_to_speech = 'text_to_speech.wav'
vocals = f'./{folder}/{file_name}/vocals.wav'
vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
output_rate_audio = 'output_rate_audio.wav'
output_audio = 'output.wav'
output_video = 'output.mp4'

def gain_time(audio):
    # Duration in seconds via ffprobe; route stderr away from stdout so
    # float() only ever sees the printed duration value.
    command = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
               '-of', 'default=noprint_wrappers=1:nokey=1', audio]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    return float(result.stdout)
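
# For reference, gain_time() is equivalent to running:
#   ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 video.mp4
# which prints a single float (e.g. `12.345600`).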

def time_verify():
    # Tempo factor for ffmpeg's atempo filter: stretch or compress the TTS
    # track to match the original vocals (atempo > 1 speeds up, < 1 slows down).
    tts_time = gain_time(text_to_speech)
    vocal_time = gain_time(vocals_monorail)
    # Note: the original max/min ratio could only ever speed the audio up;
    # dividing TTS length by vocal length also slows it down when the TTS
    # output is shorter than the vocals.
    return tts_time / vocal_time
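
# Worked example: 25 s of TTS over 20 s of vocals gives atempo = 25/20 = 1.25,
# compressing the TTS to 20 s. ffmpeg's atempo accepts roughly 0.5-2.0 per
# instance (newer builds allow up to 100); ratios outside that range would
# need chained atempo filters.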

def translator(text, TR_LANGUAGE, LANGUAGE):
    # Note: this reads the module-level `language` mapping, not the local one
    # assigned inside video_inputs.
    return ts.translate_text(text, translator=TR_LANGUAGE,
                             from_language='auto', to_language=language[LANGUAGE])
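
# Hypothetical call, assuming 'English' is a key in list_dict's google table:
#   translator('你好，世界', 'google', 'English')  # -> 'Hello, world'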

def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
    language = translate[TR_LANGUAGE]  # shadows the module-level name

    if video is None:
        raise gr.Error('No video file submitted!')
    elif language is None:
        raise gr.Error('Please select the google translator!')
    elif TR_LANGUAGE != 'google':
        raise gr.Error('Language has been reloaded, please select again!')
    elif float(gain_time(video)) > 60:
        raise gr.Error('Exceeded the 60-second video limit!')

    # Split the upload into a video-only MP4 (stream-copied) and a 16 kHz
    # mono WAV for spleeter.
    ff = ffmpy.FFmpeg(
        inputs={video: None},
        outputs={
            main_video: ['-y', '-map', '0:v', '-c:v', 'copy', '-f', 'mp4'],
            main_audio: ['-y', '-map', '0:a', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
        }
    )
    ff.run()

    # Separate vocals from accompaniment with spleeter's 2-stem model.
    subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
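
    # spleeter writes the stems to ./output_directory/audio/vocals.wav and
    # ./output_directory/audio/accompaniment.wav, matching the paths defined
    # at the top of the file.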

    # Downmix the separated vocals to 16 kHz mono for transcription.
    # (The original also passed '-i vocals' in the output options, which
    # duplicated the input; the inputs dict already supplies it.)
    ff = ffmpy.FFmpeg(
        inputs={vocals: None},
        outputs={
            vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
        }
    )
    ff.run()

    # Transcribe the vocals with the hosted whisper-large-v3 Space.
    client = Client('https://hf-audio-whisper-large-v3.hf.space/')
    result = client.predict(
        vocals_monorail,  # str (filepath or URL to file) in 'inputs' Audio component
        'transcribe',     # str in 'Task' Radio component
        api_name='/predict'
    )

    ts_text = translator(result, TR_LANGUAGE, LANGUAGE)
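
    # Assumption: the Space's /predict endpoint returns the transcription as a
    # plain string; if it returned a tuple or dict, `result` would need
    # unpacking before the translator() call above.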

    # Synthesize the translated text with edge-tts (async API, run to completion here).
    async def amain():
        communicate = edge_tts.Communicate(ts_text, SPEAKER)
        await communicate.save(text_to_speech)
    asyncio.run(amain())

    # Retime the TTS audio so it lines up with the original vocal track.
    rate_audio = time_verify()
    ff = ffmpy.FFmpeg(
        inputs={text_to_speech: None},
        outputs={
            output_rate_audio: ['-y', '-filter:a', f'atempo={rate_audio}']
        }
    )
    ff.run()

    # Mix the retimed speech with the original accompaniment.
    ff = ffmpy.FFmpeg(
        inputs={
            output_rate_audio: None,
            accompaniment: None
        },
        outputs={
            output_audio: '-y -filter_complex amix=inputs=2'
        }
    )
    ff.run()
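
    # Note: amix averages its inputs, so the mixed track is quieter than the
    # sources; a trailing volume filter (e.g. 'amix=inputs=2,volume=2') could
    # compensate if needed.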

    # Remux: copy the video stream and encode the new audio track as AAC.
    ff = ffmpy.FFmpeg(
        inputs={output_audio: None, main_video: None},
        outputs={output_video: '-y -c:v copy -c:a aac -strict experimental'}
    )
    ff.run()

    return output_video

with gr.Blocks() as demo:
    TR_LANGUAGE = gr.Dropdown(translate, value=list(translate.keys())[9], label='Translator')
    LANGUAGE = gr.Dropdown(language, value=list(language.keys())[0], label='Language')
    SPEAKER = gr.Dropdown(speaker, value=speaker[0], label='Speaker')
    gr.Interface(
        fn=video_inputs,
        inputs=[
            gr.Video(height=320, width=600, interactive=True, label='Input_video'),
            TR_LANGUAGE,
            LANGUAGE,
            SPEAKER,
        ],
        outputs=[
            gr.Video(height=320, width=600, label='Output_video'),
        ],
        title="Short-Video-To-Video",
        description="🤗 Powered by [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3). Video length is limited to 60 seconds. Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
    )
demo.launch()
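
# Runtime requirements (inferred from the imports above, not pinned in this
# commit): ffmpy, edge-tts, gradio, gradio_client, translators, spleeter,
# plus the ffmpeg/ffprobe binaries on PATH.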