ximod1a committed on
Commit ef48eb4 · 1 Parent(s): 8d36ddd

Create app.py

Files changed (1)
  app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
+ import ffmpy
+ import asyncio
+ import edge_tts
+ import subprocess
+ import gradio as gr
+ import translators as ts
+ from gradio_client import Client
+ from list_dict import translates, speakers
+
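+ # `list_dict` is a sibling module in this repo, not part of this commit. The
+ # shapes assumed below (illustrative only): `translates` maps a translator
+ # name to a {label: language-code} dict, with the 10th key expected to be
+ # 'google'; `speakers` is a list of edge-tts voice names.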
+ translate = translates
+ language = translate[list(translate.keys())[9]]
+ speaker = speakers
+
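+ # Working-file paths; spleeter writes its stems to ./output_directory/audio/.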
+ file_name = 'audio'
+ main_video = 'video.mp4'
+ main_audio = f'{file_name}.wav'
+ folder = 'output_directory'
+ text_to_speech = 'text_to_speech.wav'
+ vocals = f'./{folder}/{file_name}/vocals.wav'
+ vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
+ accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
+ output_rate_audio = 'output_rate_audio.wav'
+ output_audio = 'output.wav'
+ output_video = 'output.mp4'
+
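+ # Return a media file's duration in seconds, probed with ffprobe.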
+ def gain_time(audio):
+     command = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', audio]
+     result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+     return float(result.stdout)
+
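+ # Ratio between the TTS track's length and the original vocals' length,
+ # used below as the atempo factor when retiming the synthesized speech.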
+ def time_verify():
+     audios = [vocals_monorail, text_to_speech]
+     time_lists = []
+
+     for audio in audios:
+         time_lists.append(gain_time(audio))
+
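+     # atempo needs the TTS duration over the vocals' duration: > 1 speeds
+     # the dub up, < 1 slows it down to match the original timing.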
+     r_time = time_lists[1] / time_lists[0]
+     return r_time
+
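+ # Translate text via the `translators` package; `language` maps UI labels
+ # to target-language codes.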
+ def translator(text, TR_LANGUAGE, LANGUAGE):
+     ts_text = ts.translate_text(text, translator=TR_LANGUAGE, from_language='auto', to_language=language[LANGUAGE])
+     return ts_text
+
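+ # End-to-end dubbing pipeline: demux the upload, separate vocals from
+ # accompaniment, transcribe, translate, synthesize, retime, remix, remux.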
+ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
+     language = translate.get(TR_LANGUAGE)
+
+     if video is None:
+         raise gr.Error('No video file submitted!')
+     elif language is None:
+         raise gr.Error('Please select a translator!')
+     elif TR_LANGUAGE != 'google':
+         raise gr.Error('Only the google translator is supported, please select it again!')
+     elif float(gain_time(video)) > 60:
+         raise gr.Error('Video exceeds the 60-second limit!')
+
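+     # Demux: a video-only MP4 plus a 16 kHz mono WAV for separation and ASR.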
+     ff = ffmpy.FFmpeg(
+         inputs={
+             video: None
+         },
+         outputs={
+             main_video: ['-y', '-map', '0:0', '-c:v', 'copy', '-f', 'mp4'],
+             main_audio: ['-y', '-map', '0:a', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
+         }
+     )
+     ff.run()
+
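+     # Split the soundtrack into vocals and accompaniment (spleeter, 2 stems).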
+     subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
+
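+     # Downmix the separated vocals to 16 kHz mono for whisper.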
+     ff = ffmpy.FFmpeg(
+         inputs={
+             vocals: None
+         },
+         outputs={
+             vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
+         }
+     )
+     ff.run()
+
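+     # Transcribe the vocals via the hosted whisper-large-v3 Space.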
+     client = Client('https://hf-audio-whisper-large-v3.hf.space/')
+     result = client.predict(
+         vocals_monorail,  # str (filepath or URL to file) in 'inputs' Audio component
+         'transcribe',  # str in 'Task' Radio component
+         api_name='/predict'
+     )
+
+     ts_text = translator(result, TR_LANGUAGE, LANGUAGE)
+
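+     # Synthesize the translation with edge-tts using the chosen voice.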
+     async def amain():
+         communicate = edge_tts.Communicate(ts_text, SPEAKER)
+         await communicate.save(text_to_speech)
+     asyncio.run(amain())
+
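+     # Retime the TTS so its duration matches the vocals'. Note that ffmpeg's
+     # atempo filter only accepts factors in roughly [0.5, 100]; an extreme
+     # length mismatch would fail here.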
+     rate_audio = time_verify()
+     ff = ffmpy.FFmpeg(
+         inputs={text_to_speech: None},
+         outputs={
+             output_rate_audio: ['-y', '-filter:a', f'atempo={rate_audio}']
+         }
+     )
+     ff.run()
+
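+     # Mix the retimed speech back with the original accompaniment.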
+     ff = ffmpy.FFmpeg(
+         inputs={
+             output_rate_audio: None,
+             accompaniment: None
+         },
+         outputs={
+             output_audio: '-y -filter_complex amix=inputs=2'
+         }
+     )
+     ff.run()
+
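+     # Remux: copy the original video stream and encode the new audio as AAC.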
+     ff = ffmpy.FFmpeg(
+         inputs={output_audio: None, main_video: None},
+         outputs={output_video: '-y -c:v copy -c:a aac -strict experimental'}
+     )
+     ff.run()
+
+     return output_video
+
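+ # Gradio UI: dropdowns for translator, target language, and voice, wrapped
+ # in a gr.Interface around video_inputs.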
+ with gr.Blocks() as demo:
+     TR_LANGUAGE = gr.Dropdown(translate, value=list(translate.keys())[9], label='Translator')
+     LANGUAGE = gr.Dropdown(language, value=list(language.keys())[0], label='Language')
+     SPEAKER = gr.Dropdown(speaker, value=speaker[0], label='Speaker')
+     gr.Interface(
+         fn=video_inputs,
+         inputs=[
+             gr.Video(height=320, width=600, interactive=True, label='Input_video'),
+             TR_LANGUAGE,
+             LANGUAGE,
+             SPEAKER,
+         ],
+         outputs=[
+             gr.Video(height=320, width=600, label='Output_video'),
+         ],
+         title="Short-Video-To-Video",
+         description="🤗 [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3). Video length is limited to 60 seconds. Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
+     )
+ demo.launch()