ximod1a committed
Commit c9e3769 · 1 Parent(s): c6ac616

Update app.py

Files changed (1):
  1. app.py +60 -16

app.py CHANGED
@@ -1,3 +1,4 @@
+import re
 import ffmpy
 import asyncio
 import edge_tts
@@ -22,6 +23,7 @@ text_to_speech = 'text_to_speech.wav'
 vocals = f'./{folder}/{file_name}/vocals.wav'
 vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
 accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
+output_left_audio = 'output_left_audio.wav'
 output_rate_audio = 'output_rate_audio.wav'
 output_audio = 'output.wav'
 output_video = 'output.mp4'
@@ -31,14 +33,26 @@ def gain_time(audio):
     result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     return float(result.stdout)
 
+def left_justified(audio):
+    command = ['ffmpeg', '-i', audio, '-af', 'silencedetect=n=-38dB:d=0.01', '-f', 'null', '-']
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    return re.search(r'silence_duration: (\d+\.\d+)', result.stdout.decode(), re.M|re.S).group(1)
+
 def time_verify():
     audios = [vocals_monorail, text_to_speech]
+    justified = []
     time_lists = []
 
     for audio in audios:
+        justified.append(left_justified(audio))
         time_lists.append(gain_time(audio))
 
-    r_time = float(max(time_lists)) / float(min(time_lists))
-    return r_time
+    j_time = float(justified[0]) - float(justified[1])
+
+    if float(time_lists[0]) > float(time_lists[1]):
+        r_time = float(min(time_lists)) / (float(max(time_lists)) - j_time)
+    else:
+        r_time = float(max(time_lists)) / float(min(time_lists))
+    return j_time, r_time
 
 def translator(text, TR_LANGUAGE, LANGUAGE):
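
The new left_justified() scrapes ffmpeg's silencedetect log, which reports detected silences on stderr as lines like 'silence_end: 0.8 | silence_duration: 0.8'; the first match is taken as the leading silence. A worked example of the arithmetic time_verify() then does (all numbers illustrative, not from the commit):

# Assumed inputs: vocals run 12.0 s with 0.8 s of leading silence,
# the TTS render runs 10.0 s with 0.2 s of leading silence.
vocals_len, tts_len = 12.0, 10.0   # what gain_time() would report
vocals_sil, tts_sil = 0.8, 0.2     # what left_justified() would report

j_time = vocals_sil - tts_sil      # 0.6 s of lead-in the TTS is missing

if vocals_len > tts_len:
    # TTS is shorter: stretch it to fill the vocals, minus the silence
    # that gets padded back in front afterwards.
    r_time = tts_len / (vocals_len - j_time)   # 10.0 / 11.4 = 0.877...
else:
    # TTS is longer: speed it up to match the vocals' duration.
    r_time = tts_len / vocals_len

# atempo=0.877 slows the TTS down; j_time > 0 selects the padding branch below.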
@@ -77,12 +91,10 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
             }
         )
         ff.run()
-    except ffmpy.FFRuntimeError:
-        raise gr.Error('Mismatched audio!')
-
-    subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
 
-    ff = ffmpy.FFmpeg(
+        subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
+
+        ff = ffmpy.FFmpeg(
             inputs={
                 vocals: None
             },
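
Moving the spleeter call inside the try block changes nothing about its output layout: for the 2stems models, each input lands in its own subdirectory under the -o path. A standalone sketch (paths illustrative, not from the commit):

import subprocess

# Writes audio_output/song/vocals.wav and audio_output/song/accompaniment.wav,
# which is exactly the layout the vocals/accompaniment constants at the top
# of app.py point at.
subprocess.run(
    ['spleeter', 'separate', '-o', 'audio_output',
     '-p', 'spleeter:2stems-16kHz', 'song.wav'],
    check=True,
)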
@@ -90,14 +102,18 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
                 vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
             }
         )
-    ff.run()
-
-    client = Client('https://hf-audio-whisper-large-v3.hf.space/')
-    result = client.predict(
+        ff.run()
+
+        client = Client('https://hf-audio-whisper-large-v3.hf.space/')
+        result = client.predict(
             vocals_monorail, # str (filepath or URL to file) in 'inputs' Audio component
             'transcribe', # str in 'Task' Radio component
             api_name='/predict'
-    )
+        )
+    except ffmpy.FFRuntimeError:
+        raise gr.Error('Mismatched audio!')
+    except client.RemoteDisconnected as e:
+        raise gr.Error(e)
 
     ts_text = translator(result, TR_LANGUAGE, LANGUAGE)
 
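One caveat with the new handlers: 'except client.RemoteDisconnected' evaluates an attribute on the Client instance at handling time (an attribute gradio_client does not obviously provide), and 'client' is only bound if Client(...) itself succeeded earlier in the try block. A narrower sketch that avoids both issues; the broad catch is an assumption on my part, since the exact exception type depends on gradio_client's HTTP stack:

import gradio as gr
from gradio_client import Client

client = Client('https://hf-audio-whisper-large-v3.hf.space/')
try:
    result = client.predict(
        'vocals_monorail.wav',  # filepath for the 'inputs' Audio component
        'transcribe',           # 'Task' Radio component
        api_name='/predict',
    )
except Exception as e:  # surface any remote failure to the UI
    raise gr.Error(str(e))
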
@@ -106,18 +122,40 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
         await communicate.save(text_to_speech)
     asyncio.run(amain())
 
-    rate_audio = time_verify()
+    j_time, r_time = time_verify()
     ff = ffmpy.FFmpeg(
-        inputs={text_to_speech: None},
+        inputs={
+            text_to_speech: None
+        },
         outputs={
-            output_rate_audio: ['-y', '-filter:a', f'atempo={rate_audio}']
+            output_rate_audio: ['-y', '-filter:a', f'atempo={r_time}']
         }
     )
     ff.run()
+    if j_time > 0:
+        ff = ffmpy.FFmpeg(
+            inputs={
+                output_rate_audio: None
+            },
+            outputs={
+                output_left_audio: ['-y', '-af', f'areverse,apad=pad_dur={j_time}s,areverse']
+            }
+        )
+        ff.run()
+    else:
+        ff = ffmpy.FFmpeg(
+            inputs={
+                output_rate_audio: None
+            },
+            outputs={
+                output_left_audio: ['-y', '-filter:a', f'atrim=start={abs(j_time)}']
+            }
+        )
+        ff.run()
 
     ff = ffmpy.FFmpeg(
         inputs={
-            output_rate_audio: None,
+            output_left_audio: None,
             accompaniment: None
         },
         outputs={
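
The areverse,apad,areverse chain exists because apad can only append silence: reversing the stream, padding the (now leading) tail, and reversing back prepends j_time seconds instead. ffmpeg's adelay filter does the same in one pass, taking milliseconds; a sketch with illustrative paths follows. Also worth knowing: a single atempo instance only accepts ratios in roughly [0.5, 2.0] on older ffmpeg builds, so extreme r_time values would need chained atempo filters.

import ffmpy

j_time = 0.6  # assumed positive offset, in seconds
ff = ffmpy.FFmpeg(
    inputs={'speech.wav': None},
    outputs={'speech_padded.wav': ['-y', '-af', f'adelay={int(j_time * 1000)}:all=1']},
)
ff.run()  # runs: ffmpeg -i speech.wav -y -af adelay=600:all=1 speech_padded.wav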
@@ -132,7 +170,7 @@ def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
     )
     ff.run()
 
-    return output_video
+    return output_video, accompaniment, vocals_monorail, output_left_audio, text_to_speech, result, ts_text
 
 with gr.Blocks() as demo:
     TR_LANGUAGE = gr.Dropdown(translate, value=tr, label='Translator')
@@ -148,6 +186,12 @@ with gr.Blocks() as demo:
         ],
         outputs=[
            gr.Video(height=320, width=600, label='Output_video'),
+           gr.Audio(label='Accompaniment'),
+           gr.Audio(label='Vocals'),
+           gr.Audio(label='Vocals_justified'),
+           gr.Audio(label='Text_speech'),
+           gr.Text(label='Original'),
+           gr.Text(label='Translation'),
         ],
         title="Short-Video-To-Video",
         description="🤗 [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3), Limited the video length to 60 seconds. Currently only supports google Translator, Use other [translators](https://github.com/DUQIA/Short-Video-To-Video/blob/main/README.md#use-other-translators). Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
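
The seven values video_inputs() now returns are assigned to these seven output components position by position. A minimal, self-contained sketch of that mapping (the stand-in function and single Video input are illustrative; the commit's real input wiring sits outside this diff):

import gradio as gr

def fake_pipeline(video):
    # Stand-in for video_inputs(): gradio matches the returned tuple to the
    # outputs list index for index, so seven values need seven components.
    return ('out.mp4', 'acc.wav', 'voc.wav', 'left.wav', 'tts.wav',
            'original text', 'translated text')

demo = gr.Interface(
    fn=fake_pipeline,
    inputs=gr.Video(),
    outputs=[
        gr.Video(label='Output_video'),
        gr.Audio(label='Accompaniment'),
        gr.Audio(label='Vocals'),
        gr.Audio(label='Vocals_justified'),
        gr.Audio(label='Text_speech'),
        gr.Text(label='Original'),
        gr.Text(label='Translation'),
    ],
)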