Spaces:
Running
Running
File size: 4,353 Bytes
ef48eb4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import ffmpy
import asyncio
import edge_tts
import subprocess
import gradio as gr
import translators as ts
from gradio_client import Client
from list_dict import translates, speakers
# Lookup tables imported from list_dict, bound to the names used by the rest
# of this file.
translate = translates
speaker = speakers

# Language table of the translator stored at position 9 of `translate`.
# It fills the "Language" dropdown and is the table translator() reads.
# NOTE(review): position 9 is presumably 'google', the only translator that
# video_inputs accepts -- confirm against list_dict.
_default_translator = list(translate.keys())[9]
language = translate[_default_translator]

# Base names shared by several of the paths below.
file_name = 'audio'
folder = 'output_directory'

# Files written to the working directory while one video is processed.
main_video = 'video.mp4'
main_audio = file_name + '.wav'
text_to_speech = 'text_to_speech.wav'
output_rate_audio = 'output_rate_audio.wav'
output_audio = 'output.wav'
output_video = 'output.mp4'

# Files located under ./<folder>/<file_name>/.
vocals = '/'.join(('.', folder, file_name, 'vocals.wav'))
vocals_monorail = '/'.join(('.', folder, file_name, 'vocals_monorail.wav'))
accompaniment = '/'.join(('.', folder, file_name, 'accompaniment.wav'))
def gain_time(audio):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        audio: Path of the audio or video file to probe.

    Returns:
        The ``format=duration`` value printed by ffprobe, as a float.

    Raises:
        ValueError: If ffprobe exits with a non-zero status, or if what it
            printed cannot be parsed as a number.
        OSError: If the ffprobe executable cannot be started.
    """
    command = [
        'ffprobe', '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        audio,
    ]
    # stderr is captured separately: diagnostics mixed into stdout would end
    # up inside the text handed to float() and hide the real failure.
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        detail = result.stderr.decode(errors='replace').strip()
        raise ValueError(f'ffprobe could not read {audio!r}: {detail}')
    try:
        return float(result.stdout)
    except ValueError:
        printed = result.stdout.decode(errors='replace').strip()
        raise ValueError(
            f'ffprobe reported no usable duration for {audio!r}: {printed!r}'
        ) from None
def time_verify():
    """Return the ffmpeg ``atempo`` factor that fits the speech to the vocals.

    The factor is the duration of ``text_to_speech`` divided by the duration
    of ``vocals_monorail``: above 1 when the synthesized speech is longer
    than the vocal track (it must be sped up), below 1 when it is shorter
    (it must be slowed down). Dividing the longer duration by the shorter
    one would always give a factor >= 1 and speed up speech that is already
    too short.

    Returns:
        The tempo factor as a float, clamped to the 0.5-100.0 range.

    Raises:
        ValueError: If either file has a non-positive duration, or if
            gain_time cannot determine a duration.
    """
    # Range accepted by ffmpeg's atempo filter -- see the FFmpeg filters
    # documentation; a value outside it makes the ffmpeg call fail.
    min_tempo, max_tempo = 0.5, 100.0

    vocals_seconds = gain_time(vocals_monorail)
    speech_seconds = gain_time(text_to_speech)
    if vocals_seconds <= 0 or speech_seconds <= 0:
        raise ValueError(
            'Cannot compute a tempo factor from durations '
            f'{vocals_seconds} (vocals) and {speech_seconds} (speech)'
        )

    tempo = speech_seconds / vocals_seconds
    return min(max(tempo, min_tempo), max_tempo)
def translator(text, TR_LANGUAGE, LANGUAGE):
    """Translate `text` with the selected backend into the selected language.

    Args:
        text: Text to translate; the source language is auto-detected
            (``from_language='auto'``).
        TR_LANGUAGE: Backend name, forwarded as the ``translator`` argument.
        LANGUAGE: Key into the module-level ``language`` table; the value
            stored under it is forwarded as ``to_language``.

    Returns:
        Whatever ``ts.translate_text`` returns for these arguments.
    """
    # The target code comes from the module-level table, not from a table
    # selected by TR_LANGUAGE.
    target_code = language[LANGUAGE]
    return ts.translate_text(
        text,
        translator=TR_LANGUAGE,
        from_language='auto',
        to_language=target_code,
    )
def _run_ffmpeg(inputs, outputs):
    """Run a single ffmpeg invocation through ffmpy with the given option maps."""
    ffmpy.FFmpeg(inputs=inputs, outputs=outputs).run()


def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
    """Dub an uploaded video with translated, synthesized speech.

    Steps, each working on the fixed file names defined at module level:

    1. Split the upload into ``main_video`` and a 16 kHz mono ``main_audio``.
    2. Run spleeter on ``main_audio``; ``vocals`` and ``accompaniment`` are
       read from its output directory afterwards.
    3. Convert ``vocals`` to 16 kHz mono (``vocals_monorail``) and send it to
       the hosted whisper-large-v3 Space for transcription.
    4. Translate the transcription and synthesize it with edge_tts into
       ``text_to_speech``.
    5. Retime the speech with ``atempo``, mix it with ``accompaniment`` and
       mux the result with ``main_video`` into ``output_video``.

    Args:
        video: Path of the uploaded video, or None when nothing was uploaded.
        TR_LANGUAGE: Selected translator backend; only 'google' is accepted.
        LANGUAGE: Selected target-language key, forwarded to translator().
        SPEAKER: Voice handed to ``edge_tts.Communicate``.

    Returns:
        The path of the dubbed video (``output_video``).

    Raises:
        gr.Error: If no video was uploaded, the translator choice is missing
            or not 'google', or the video is longer than 60 seconds.
        subprocess.CalledProcessError: If spleeter exits with an error.
    """
    if video is None:
        raise gr.Error('No audio file submitted!')

    # Look the translator up defensively: a cleared or unknown dropdown value
    # should produce the friendly error below, not an uncaught KeyError.
    try:
        language_table = translate[TR_LANGUAGE]
    except KeyError:
        language_table = None
    if language_table is None:
        raise gr.Error('Please select google translator!')
    if TR_LANGUAGE != 'google':
        raise gr.Error('Language has been reloaded, please select again!')
    if gain_time(video) > 60:
        raise gr.Error('Exceed maximum limit!')

    # Split the upload into a video file and a 16 kHz mono WAV.
    # NOTE(review): '-map 0:0' takes the first stream of the upload, which is
    # assumed to be the video stream -- confirm for unusual containers.
    _run_ffmpeg(
        {video: None},
        {
            main_video: ['-y', '-map', '0:0', '-c:a', 'copy', '-f', 'mp4'],
            main_audio: ['-y', '-map', '0:a', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav'],
        },
    )

    # check=True: without it a failed separation goes unnoticed and the steps
    # below would pick up stems left behind by an earlier run.
    subprocess.run(
        ['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio],
        check=True,
    )

    # Convert the vocal stem to 16 kHz mono. The input is declared once, in
    # the inputs map; it is not repeated as '-i' among the output options.
    _run_ffmpeg(
        {vocals: None},
        {
            vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav'],
        },
    )

    # Transcribe the vocals with the hosted whisper-large-v3 Space.
    client = Client('https://hf-audio-whisper-large-v3.hf.space/')
    transcription = client.predict(
        vocals_monorail,  # str (filepath or URL to file) in 'inputs' Audio component
        'transcribe',  # str in 'Task' Radio component
        api_name='/predict'
    )
    ts_text = translator(transcription, TR_LANGUAGE, LANGUAGE)

    async def amain():
        # edge_tts exposes a coroutine API; it is driven by asyncio.run below.
        communicate = edge_tts.Communicate(ts_text, SPEAKER)
        await communicate.save(text_to_speech)

    asyncio.run(amain())

    # Retime the synthesized speech with the factor computed by time_verify().
    rate_audio = time_verify()
    _run_ffmpeg(
        {text_to_speech: None},
        {output_rate_audio: ['-y', '-filter:a', f'atempo={rate_audio}']},
    )

    # Mix the retimed speech with the accompaniment stem.
    _run_ffmpeg(
        {output_rate_audio: None, accompaniment: None},
        {output_audio: '-y -filter_complex amix=inputs=2'},
    )

    # Mux the mixed audio with the extracted video; the video stream is copied.
    _run_ffmpeg(
        {output_audio: None, main_video: None},
        {output_video: '-y -c:v copy -c:a aac -strict experimental'},
    )
    return output_video
# Build the UI: three dropdowns (translator, target language, speaker) plus an
# Interface that feeds them, together with the uploaded video, to video_inputs.
with gr.Blocks() as demo:
    # Defaults: the translator at position 9 of `translate`, the first entry
    # of the `language` table, and the first speaker.
    TR_LANGUAGE = gr.Dropdown(translate, value=list(translate.keys())[9], label='Translator')
    LANGUAGE = gr.Dropdown(language, value=list(language.keys())[0], label='Language')
    SPEAKER = gr.Dropdown(speaker, value=speaker[0], label='Speaker')
    gr.Interface(
        fn=video_inputs,
        inputs=[
            gr.Video(height=320, width=600, interactive=True, label='Input_video'),
            TR_LANGUAGE,
            LANGUAGE,
            SPEAKER,
        ],
        outputs=[
            gr.Video(height=320, width=600, label='Output_video'),
        ],
        title="Short-Video-To-Video",
        description="🤗 [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3), Limited the video length to 60 seconds. Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
    )

# Start the app. The stray trailing '|' that followed this call was removed:
# a binary operator with no right-hand operand is a syntax error.
demo.launch()