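"""Short-Video-To-Video: dub a short clip into another language.

Pipeline (all intermediates are written to the working directory):
ffmpeg demux -> spleeter vocal/accompaniment split -> whisper-large-v3
transcription (remote Space) -> text translation -> edge-tts synthesis ->
atempo time-stretch -> amix remix -> ffmpeg remux onto the original video.
"""
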
import ffmpy
import asyncio
import edge_tts
import subprocess
import gradio as gr
import translators as ts
from gradio_client import Client
from list_dict import translates, speakers

# Defaults pulled from list_dict.py: translator backend, target language,
# and edge-tts voice.
translate = translates
tr = list(translate.keys())[9]
language = translate[tr]
la = list(language.keys())[0]
speaker = speakers
sp = speaker[0]
gl = True  # flips to False when the translator dropdown goes stale

# Intermediate and output file paths.
file_name = 'audio'
main_video = 'video.mp4'
main_audio = f'{file_name}.wav'
folder = 'output_directory'
text_to_speech = 'text_to_speech.wav'
vocals = f'./{folder}/{file_name}/vocals.wav'
vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
output_rate_audio = 'output_rate_audio.wav'
output_audio = 'output.wav'
output_video = 'output.mp4'

def gain_time(audio):
    # Return the duration of a media file in seconds, as reported by ffprobe.
    command = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
               '-of', 'default=noprint_wrappers=1:nokey=1', audio]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return float(result.stdout)
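# e.g. gain_time('video.mp4') -> 37.5 (value illustrative). Since stderr is
# merged into stdout above, float() raises ValueError whenever ffprobe fails.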

def time_verify():
    # atempo=r plays the input r times faster, so the stretched TTS track
    # ends up at duration/r seconds; r = tts/vocals makes it match the
    # length of the original vocals.
    r_time = gain_time(text_to_speech) / gain_time(vocals_monorail)
    # ffmpeg accepts atempo factors in [0.5, 100.0] (older builds cap at
    # 2.0), so clamp the lower bound for very short TTS output.
    return max(0.5, r_time)
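# Example: 12 s of synthesized speech against 10 s of vocals gives
# atempo=1.2, i.e. play the speech 1.2x faster so it lands back at 10 s.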

def translator(text, TR_LANGUAGE, LANGUAGE):
    ts_text = ts.translate_text(text, translator=TR_LANGUAGE,
                                from_language='auto', to_language=language[LANGUAGE])
    return ts_text

def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
    # gl and language are module-level state; without the global
    # declaration, reading gl below would raise UnboundLocalError.
    global gl, language
    language = translate.get(TR_LANGUAGE)
    if video is None:
        raise gr.Error('No video file submitted!')
    elif language is None:
        raise gr.Error('Please select the Google translator!')
    elif TR_LANGUAGE != tr:
        gl = False
        raise gr.Error('Language has been reloaded, please select again!')
    elif gl is False:
        gl = True
        raise gr.Error('Language has been reloaded, please select again!')
    elif gain_time(video) > 60:
        raise gr.Error('Exceeded the maximum video length of 60 seconds!')

    # Demux: keep the video stream as-is and extract the audio track as
    # 16 kHz mono PCM, the format spleeter and whisper expect.
    ff = ffmpy.FFmpeg(
        inputs={
            video: None
        },
        outputs={
            main_video: ['-y', '-map', '0:v', '-c:v', 'copy', '-an', '-f', 'mp4'],
            main_audio: ['-y', '-map', '0:a', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
        }
    )
    ff.run()

    # Split the extracted audio into vocals and accompaniment with spleeter.
    subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])

    # Downmix the separated vocals to 16 kHz mono for transcription
    # (ffmpy already emits -i for every entry in inputs).
    ff = ffmpy.FFmpeg(
        inputs={
            vocals: None
        },
        outputs={
            vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
        }
    )
    ff.run()

    # Transcribe the clean vocals with the hosted whisper-large-v3 Space.
    client = Client('https://hf-audio-whisper-large-v3.hf.space/')
    result = client.predict(
        vocals_monorail,  # str (filepath or URL to file) in 'inputs' Audio component
        'transcribe',     # str in 'Task' Radio component
        api_name='/predict'
    )
    ts_text = translator(result, TR_LANGUAGE, LANGUAGE)
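
    # SPEAKER is an edge-tts voice name (e.g. 'en-US-AriaNeural'); running
    # `edge-tts --list-voices` prints the full catalogue.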
    async def amain():
        communicate = edge_tts.Communicate(ts_text, SPEAKER)
        await communicate.save(text_to_speech)
    asyncio.run(amain())

    # Stretch the synthesized speech so it lines up with the original vocals.
    rate_audio = time_verify()
    ff = ffmpy.FFmpeg(
        inputs={text_to_speech: None},
        outputs={
            output_rate_audio: ['-y', '-filter:a', f'atempo={rate_audio}']
        }
    )
    ff.run()
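
    # amix sums the two tracks (scaling each input's volume to avoid
    # clipping) and keeps the duration of the longest one by default.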
    ff = ffmpy.FFmpeg(
        inputs={
            output_rate_audio: None,
            accompaniment: None
        },
        outputs={
            output_audio: '-y -filter_complex amix=inputs=2'
        }
    )
    ff.run()

    # Mux the new audio track back onto the untouched video stream.
    ff = ffmpy.FFmpeg(
        inputs={output_audio: None, main_video: None},
        outputs={output_video: '-y -c:v copy -c:a aac -strict experimental'}
    )
    ff.run()
    return output_video
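
# A minimal smoke test without the UI (assumes a short local clip named
# 'sample.mp4'; the file name is illustrative, not part of the app):
#   print(video_inputs('sample.mp4', tr, la, sp))  # -> 'output.mp4'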

with gr.Blocks() as demo:
    TR_LANGUAGE = gr.Dropdown(translate, value=tr, label='Translator')
    LANGUAGE = gr.Dropdown(language, value=la, label='Language')
    SPEAKER = gr.Dropdown(speaker, value=sp, label='Speaker')
    gr.Interface(
        fn=video_inputs,
        inputs=[
            gr.Video(height=320, width=600, interactive=True, label='Input_video'),
            TR_LANGUAGE,
            LANGUAGE,
            SPEAKER,
        ],
        outputs=[
            gr.Video(height=320, width=600, label='Output_video'),
        ],
        title="Short-Video-To-Video",
        description="🤗 [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3). Video length is limited to 60 seconds, and currently only the Google translator is supported. Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
    )
demo.launch()