import ffmpy
import asyncio
import edge_tts
import subprocess
import gradio as gr
import translators as ts
from gradio_client import Client
from list_dict import translates, speakers
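
# Default UI selections: the translator at index 9 of the translates dict,
# its first language entry, and the first edge-tts speaker from list_dict.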
translate = translates
tr = list(translate.keys())[9]
language = translate[tr]
la = list(language.keys())[0]
speaker = speakers
sp = speaker[0]
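
# Paths for the intermediate files produced by the pipeline: the demuxed
# audio/video, the Spleeter stems, the synthesised speech, and the final mux.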
file_name = 'audio'
main_video = 'video.mp4'
main_audio = f'{file_name}.wav'
folder = 'output_directory'
text_to_speech = 'text_to_speech.wav'
vocals = f'./{folder}/{file_name}/vocals.wav'
vocals_monorail = f'./{folder}/{file_name}/vocals_monorail.wav'
accompaniment = f'./{folder}/{file_name}/accompaniment.wav'
output_rate_audio = 'output_rate_audio.wav'
output_audio = 'output.wav'
output_video = 'output.mp4'

def gain_time(audio):
    # Use ffprobe to read the duration (in seconds) of an audio/video file.
    command = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', audio]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return float(result.stdout)

def time_verify():
    # Ratio between the longer and shorter of the two tracks (isolated vocals
    # vs. synthesised speech); used later as the atempo factor.
    audios = [vocals_monorail, text_to_speech]
    time_lists = []
    for audio in audios:
        time_lists.append(gain_time(audio))
    r_time = float(max(time_lists)) / float(min(time_lists))
    return r_time

def translator(text, TR_LANGUAGE, LANGUAGE):
    # Translate the transcribed text with the selected translation service.
    ts_text = ts.translate_text(text, translator=TR_LANGUAGE, from_language='auto', to_language=language[LANGUAGE])
    return ts_text

def video_inputs(video, TR_LANGUAGE, LANGUAGE, SPEAKER):
    gl = True
    language = translate[TR_LANGUAGE]
    # Validate the inputs before running the pipeline.
    if video is None:
        raise gr.Error('No video file submitted!')
    elif language is None:
        raise gr.Error('Please select the Google translator!')
    elif SPEAKER is None:
        raise gr.Error('Please select a speaker!')
    elif float(gain_time(video)) > 60:
        raise gr.Error('The video exceeds the 60-second limit!')
    elif TR_LANGUAGE == tr:
        if gl is False:
            gl = True
            raise gr.Error('Language has been reloaded, please select again!')
    elif TR_LANGUAGE != tr:
        if gl is True:
            gl = False
            raise gr.Error('Language has been reloaded, please select again!')

    # Split the upload into a video-only MP4 and a 16 kHz mono PCM WAV track.
    try:
        ff = ffmpy.FFmpeg(
            inputs={
                video: None
            },
            outputs={
                main_video: ['-y', '-map', '0:0', '-c:a', 'copy', '-f', 'mp4'],
                main_audio: ['-y', '-map', '0:a', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
            }
        )
        ff.run()
    except ffmpy.FFRuntimeError:
        raise gr.Error('Mismatched audio!')

    # Separate vocals from background music with Spleeter (2-stem model),
    # then downmix the vocal stem to 16 kHz mono for transcription.
    subprocess.run(['spleeter', 'separate', '-o', folder, '-p', 'spleeter:2stems-16kHz', main_audio])
    ff = ffmpy.FFmpeg(
        inputs={
            vocals: None
        },
        outputs={
            vocals_monorail: ['-y', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-f', 'wav']
        }
    )
    ff.run()

    # Transcribe the vocal track with the hosted whisper-large-v3 Space.
    client = Client('https://hf-audio-whisper-large-v3.hf.space/')
    result = client.predict(
        vocals_monorail,  # str (filepath or URL to file) in 'inputs' Audio component
        'transcribe',     # str in 'Task' Radio component
        api_name='/predict'
    )
    # Translate the transcript, then synthesise speech with edge-tts.
    ts_text = translator(result, TR_LANGUAGE, LANGUAGE)

    async def amain():
        communicate = edge_tts.Communicate(ts_text, SPEAKER)
        await communicate.save(text_to_speech)
    asyncio.run(amain())

    # Retime the synthesised speech to the length of the original vocals,
    # mix it with the separated accompaniment, and mux the result back onto
    # the original video stream.
    rate_audio = time_verify()
    ff = ffmpy.FFmpeg(
        inputs={text_to_speech: None},
        outputs={
            output_rate_audio: ['-y', '-filter:a', f'atempo={rate_audio}']
        }
    )
    ff.run()
    ff = ffmpy.FFmpeg(
        inputs={
            output_rate_audio: None,
            accompaniment: None
        },
        outputs={
            output_audio: '-y -filter_complex amix=inputs=2'
        }
    )
    ff.run()
    ff = ffmpy.FFmpeg(
        inputs={output_audio: None, main_video: None},
        outputs={output_video: '-y -c:v copy -c:a aac -strict experimental'}
    )
    ff.run()
    return output_video

with gr.Blocks() as demo:
    TR_LANGUAGE = gr.Dropdown(translate, value=tr, label='Translator')
    LANGUAGE = gr.Dropdown(language, value=la, label='Language')
    SPEAKER = gr.Dropdown(speaker, value=sp, label='Speaker')
    gr.Interface(
        fn=video_inputs,
        inputs=[
            gr.Video(height=320, width=600, interactive=True, label='Input_video'),
            TR_LANGUAGE,
            LANGUAGE,
            SPEAKER,
        ],
        outputs=[
            gr.Video(height=320, width=600, label='Output_video'),
        ],
        title="Short-Video-To-Video",
        description="🤗 Powered by [whisper-large-v3](https://huggingface.co/spaces/hf-audio/whisper-large-v3). Video length is limited to 60 seconds. Currently only the Google translator is supported; see how to use other [translators](https://github.com/DUQIA/Short-Video-To-Video/blob/main/README.md#use-other-translators). Please check [here](https://github.com/DUQIA/Short-Video-To-Video) for details."
    )

demo.launch()