"""Gradio front-end that turns text into speech with Kokoro voices."""

# Imports
import gradio as gr
import spaces
import torch
import numpy as np
from kokoro import KModel, KPipeline

# Pre-Initialize
DEVICE = "auto"
if DEVICE == "auto":
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): DEVICE is only reported here; nothing in this file moves
# MODEL onto it, so inference runs wherever KModel() places its weights.
print(f"[SYSTEM] | Using {DEVICE} type compute device.")

# Variables
SILENT_THRESHOLD = 0.01
CHAR_LIMIT = 2000
SAMPLE_RATE = 24000  # Sample rate reported to gr.Audio alongside the samples.
DEFAULT_INPUT = ""
DEFAULT_VOICE = "af_heart"

# Dropdown label -> voice id. The first character of a voice id selects the
# pipeline in PIPELINES ("a" or "b").
CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}

# One pipeline per language code; model=False because synthesis is done by
# the shared MODEL below rather than by the pipelines themselves.
PIPELINES = {lang: KPipeline(lang_code=lang, model=False) for lang in "ab"}
# Pin the pronunciation of "kokoro" in each pipeline's lexicon.
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"

# Load every selectable voice up front.
for v in CHOICES.values():
    PIPELINES[v[0]].load_voice(v)

MODEL = KModel().eval()

css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
    visibility: hidden
}
'''


# Functions
def trim_silence(audio, threshold=SILENT_THRESHOLD):
    """Strip leading and trailing samples whose magnitude is <= threshold.

    Args:
        audio: NumPy array of samples (indexed along its first axis).
        threshold: Magnitude at or below which a sample counts as silent.

    Returns:
        The slice of ``audio`` from the first to the last sample above the
        threshold, or ``audio`` unchanged when no sample exceeds it.
    """
    loud = np.nonzero(np.abs(audio) > threshold)[0]
    if loud.size == 0:
        return audio
    return audio[loud[0]:loud[-1] + 1]


def generate(text=DEFAULT_INPUT, voice=DEFAULT_VOICE, speed=1):
    """Synthesize ``text`` with ``voice`` and return it for ``gr.Audio``.

    Args:
        text: Text to speak; stripped and truncated to CHAR_LIMIT characters.
        voice: Voice id (a value of CHOICES); its first character picks the
            pipeline.
        speed: Speed factor passed to both the pipeline and the model.

    Returns:
        ``(SAMPLE_RATE, samples)`` with silence trimmed from both ends, or
        ``None`` when there is nothing to synthesize.
    """
    text = (text or "").strip()[:CHAR_LIMIT]
    if not text:
        # Nothing to say: do not synthesize a lone ".".
        return None
    text += "."

    pipeline = PIPELINES[voice[0]]
    pack = pipeline.load_voice(voice)

    # The pipeline may split the text into several segments; synthesize all
    # of them so longer inputs are not cut off after one segment.
    segments = []
    for _, ps, _ in pipeline(text, voice, speed):
        # The voice pack is indexed by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        audio = MODEL(ps, ref_s, speed)
        segments.append(audio.detach().cpu().numpy())

    if not segments:
        return None
    return (SAMPLE_RATE, trim_silence(np.concatenate(segments)))


def cloud():
    """Log that the maintenance button was pressed."""
    print("[CLOUD] | Space maintained.")


@spaces.GPU()
def gpu():
    """Do nothing; only exists as a ``spaces.GPU``-decorated function.

    NOTE(review): presumably kept so the hosting runtime sees a GPU-decorated
    function - confirm before removing.
    """
    return


# Initialize
with gr.Blocks(css=css) as main:
    with gr.Column():
        gr.Markdown("🪄 Instantly generate realistic voices using text input.")

    with gr.Column():
        text_input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
        voice_input = gr.Dropdown(list(CHOICES.items()), value=DEFAULT_VOICE, label="Voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        submit = gr.Button("▶")
        maintain = gr.Button("☁️")

    with gr.Column():
        output = gr.Audio(label="Output")

    submit.click(fn=generate, inputs=[text_input, voice_input, speed_input], outputs=output)
    maintain.click(cloud, inputs=[], outputs=[], queue=False)

main.launch(show_api=True)