import re
import os
import ffmpy
import asyncio
import tarfile
import edge_tts
import subprocess
import gradio as gr
import translators as ts
from gradio_client import Client
from requests.exceptions import ConnectionError
from list_dict import translates, speakers
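# One-time setup: unpack the bundled Spleeter 2stems checkpoint (assumed
# to ship with this Space as 2stems.tar.gz) so the spleeter CLI can find
# it under ./pretrained_models.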
if not os.path.exists('pretrained_models'):
    with tarfile.open('2stems.tar.gz', 'r:gz') as tar_ref:
        tar_ref.extractall('./pretrained_models/2stems')
translate = translates
tr = list(translate.keys())[9]  # default translator (10th key in translates)
language = translate[tr]
la = list(language.keys())[0]   # default target language for that translator
speaker = speakers
sp = speaker[0]                 # default edge-tts voice
max_duration = 60
file_name = 'audio'
main_video = 'video.mp4'
main_audio = f'{file_name}.wav'
folder = 'output_directory'
text_to_speech = 'text_to_speech.wav'
vocals = f'./{folder}/{file_name}/vocals.wav'
vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
output_left_audio = 'output_left_audio.wav'
output_rate_audio = 'output_rate_audio.wav'
output_audio = 'output.wav'
output_video = 'output.mp4'
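# Probe a file's duration in seconds with ffprobe.
# '-of default=noprint_wrappers=1:nokey=1' makes ffprobe print the bare
# value, e.g. '12.043000'.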
def gain_time(audio):
    command = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', audio]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return float(result.stdout)
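# Measure how late the first sound starts, using ffmpeg's silencedetect
# filter. Its log contains lines such as
#   [silencedetect @ ...] silence_end: 0.51 | silence_duration: 0.51
# so the first captured silence_duration approximates the leading
# silence (n=-38dB sets the noise floor, d=0.01 the minimum silence
# length).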
def left_justified(audio):
    try:
        command = ['ffmpeg', '-i', audio, '-af', 'silencedetect=n=-38dB:d=0.01', '-f', 'null', '-']
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # \d+\.\d+ (not \d.\d+) so multi-digit durations like 12.5 are captured whole
        start_justified = re.search(r'silence_duration: (\d+\.\d+)', result.stdout.decode(), re.M | re.S).group(1)
    except AttributeError:
        raise gr.Error('No start sound detected!')
    return start_justified
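# Compare the separated vocals with the synthesized speech: j_time is
# the difference in leading silence (how much to pad or trim) and r_time
# the tempo ratio that makes the two clips roughly the same length.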
def time_verify(vocals_audio, target_audio):
    audios = [vocals_audio, target_audio]
    justified = []
    time_lists = []
    for audio in audios:
        justified.append(left_justified(audio))
        time_lists.append(gain_time(audio))
    j_time = float(justified[0]) - float(justified[1])
    if float(time_lists[0]) > float(time_lists[1]):
        r_time = float(min(time_lists)) / (float(max(time_lists)) - j_time)
    else:
        r_time = float(max(time_lists)) / float(min(time_lists))
    return round(j_time, 6), round(r_time, 6)
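# Translate the transcript with the `translators` package; the target
# language code is looked up in the module-level `language` dict for the
# currently selected translator.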
def translator(text, TR_LANGUAGE, LANGUAGE):
    try:
        ts_text = ts.translate_text(text, translator=TR_LANGUAGE, from_language='auto', to_language=language[LANGUAGE])
    except ConnectionError as i:
        raise gr.Error(f'translator ConnectionError: {i}')
    except ts.TranslatorError:
        raise gr.Error('Translator error!')
    return ts_text
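# End-to-end pipeline for one video:
#   1. demux and trim to max_duration, extracting 16 kHz mono audio
#   2. separate vocals from accompaniment with Spleeter
#   3. transcribe the vocals with the hosted Whisper Space
#   4. translate the transcript and synthesize speech with edge-tts
#   5. retime the speech, remix it with the accompaniment, remux video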
def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
    gl = True
    language = translate[TR_LANGUAGE]
    if video is None:
        raise gr.Error('No video file submitted!')
    if language is None:
        raise gr.Error('Please select the google translator!')
    elif SPEAKER is None:
        raise gr.Error('Please select a Speaker!')
    elif TR_LANGUAGE == tr:
        if gl is False:
            gl = True
            raise gr.Error('Language has been reloaded, please select again!')
    elif TR_LANGUAGE != tr:
        if gl is True:
            gl = False
            raise gr.Error('Language has been reloaded, please select again!')
    # Probe the duration only after the checks above, so ffprobe never
    # runs on a missing file; this check must sit outside the elif chain
    # or it would be skipped whenever TR_LANGUAGE == tr.
    get_time = float(gain_time(video))
    if get_time > max_duration:
        raise gr.Error(f'Exceeds the maximum duration of {max_duration}s!')
    try:
        # Demux: trim to max_duration, keep the video stream as mp4 and
        # extract the soundtrack as 16 kHz mono PCM for Spleeter/Whisper.
        ff = ffmpy.FFmpeg(
            inputs={
                video: f'-t {max_duration}'
            },
            outputs={
                main_video: ['-y', '-map', '0:0', '-c:a', 'copy', '-f', 'mp4'],
                main_audio: ['-y', '-map', '0:a', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
            }
        )
        ff.run()
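        # Source separation: the spleeter CLI writes
        # {folder}/{file_name}/vocals.wav and accompaniment.wav using the
        # 2stems-16kHz model unpacked at startup.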
        subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
        ff = ffmpy.FFmpeg(
            inputs={
                vocals: None
            },
            outputs={
                vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
            }
        )
        ff.run()
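        # Transcription: call the hosted whisper-large-v3 Space through
        # gradio_client; predict() returns the transcript as a string.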
        client = Client('https://hf-audio-whisper-large-v3.hf.space/')
        result = client.predict(
            vocals_monorail,  # str (filepath or URL to file) in 'inputs' Audio component
            'transcribe',     # str in 'Task' Radio component
            api_name='/predict'
        )
        ts_text = translator(result, TR_LANGUAGE, LANGUAGE)
    except ffmpy.FFRuntimeError:
        raise gr.Error('Mismatched audio!')
    except ConnectionError as e:
        raise gr.Error(f'API: {e}')
    async def amain():
        # Synthesize the translated text with the selected edge-tts voice.
        communicate = edge_tts.Communicate(ts_text, SPEAKER)
        await communicate.save(text_to_speech)
    asyncio.run(amain())
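    # Tempo alignment: r_time[1] feeds ffmpeg's atempo filter. Note that
    # atempo only accepts factors of roughly 0.5-2.0 on older ffmpeg
    # builds (up to 100 on newer ones); more extreme ratios would need a
    # chained filter such as 'atempo=2.0,atempo=1.5', which is not
    # handled here.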
    r_time = time_verify(vocals_monorail, text_to_speech)
    ff = ffmpy.FFmpeg(
        inputs={
            text_to_speech: None
        },
        outputs={
            output_rate_audio: ['-y', '-filter:a', f'atempo={r_time[1]}']
        }
    )
    ff.run()
    j_time = time_verify(vocals_monorail, output_rate_audio)
    if j_time[0] > 0:
        # Speech starts too early: pad silence at the front (areverse +
        # apad + areverse prepends rather than appends the padding).
        ff = ffmpy.FFmpeg(
            inputs={
                output_rate_audio: None
            },
            outputs={
                output_left_audio: ['-y', '-af', f'areverse,apad=pad_dur={j_time[0]}s,areverse']
            }
        )
        ff.run()
    else:
        # Speech starts too late: trim the surplus leading audio.
        ff = ffmpy.FFmpeg(
            inputs={
                output_rate_audio: None
            },
            outputs={
                output_left_audio: ['-y', '-filter:a', f'atrim=start={abs(j_time[0])}']
            }
        )
        ff.run()
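    # Remix: overlay the aligned speech onto the separated accompaniment,
    # then mux the mixed track back into the trimmed video.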
    ff = ffmpy.FFmpeg(
        inputs={
            output_left_audio: None,
            accompaniment: None
        },
        outputs={
            output_audio: '-y -filter_complex amix=inputs=2'
        }
    )
    ff.run()
    ff = ffmpy.FFmpeg(
        inputs={output_audio: None, main_video: None},
        outputs={output_video: '-y -c:v copy -c:a aac -strict experimental'}
    )
    ff.run()
    return output_video, accompaniment, vocals_monorail, output_left_audio, text_to_speech, result, ts_text
with gr.Blocks() as demo:
    TR_LANGUAGE = gr.Dropdown(translate, value=tr, label='Translator')
    LANGUAGE = gr.Dropdown(language, value=la, label='Language')
    SPEAKER = gr.Dropdown(speaker, value=sp, label='Speaker')
    gr.Interface(
        fn=video_inputs,
        inputs=[
            gr.Video(height=320, interactive=True, label='Input_video'),
            TR_LANGUAGE,
            LANGUAGE,
            SPEAKER,
        ],
        outputs=[
            gr.Video(height=320, label='Output_video'),
            gr.Audio(label='Accompaniment'),
            gr.Audio(label='Vocals'),
            gr.Audio(label='Vocals_justified'),
            gr.Audio(label='Text_speech'),
            gr.Text(label='Original'),
            gr.Text(label='Translation'),
        ],
        title='Short-Video-To-Video',
        description='🤗 Built on [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3). Video length is limited to 60 seconds. Currently only the Google translator is supported; see the README to [use other translators](https://github.com/DUQIA/Short-Video-To-Video/blob/main/README.md#use-other-translators). Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details.'
    )
demo.launch()
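# To run this Space locally (a sketch; assumes ffmpeg/ffprobe and the
# spleeter CLI are on PATH, 2stems.tar.gz sits next to this script, and
# list_dict.py provides `translates` and `speakers`):
#   pip install gradio gradio_client ffmpy edge-tts translators spleeter
#   python app.py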