# Imports
import gradio as gr
import spaces
import torch
import numpy as np
from kokoro import KModel, KPipeline

# Pre-Initialize
DEVICE = "auto"
# "auto" is a placeholder that is resolved once at import time: CUDA when
# PyTorch reports an available device, CPU otherwise.
if DEVICE == "auto":
    if torch.cuda.is_available():
        DEVICE = "cuda"
    else:
        DEVICE = "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")

# Variables
# Absolute sample amplitude at or below which a sample is treated as silence;
# used as the default `threshold` of trim_silence().
SILENT_THRESHOLD = 0.01
# Maximum number of input characters kept by generate() (the rest is cut off).
CHAR_LIMIT = 2000

# Initial content of the input textbox and default `text` argument of generate().
DEFAULT_INPUT = ""
# Voice ID preselected in the dropdown and default `voice` argument of generate().
DEFAULT_VOICE = "af_heart"


# Voice catalogue: display label shown in the dropdown -> Kokoro voice ID.
# The first character of each voice ID ("a" or "b") is used elsewhere in this
# file as the key into PIPELINES, so every ID must start with a language code
# for which a pipeline exists.
CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}

# One pipeline per language code ("a" and "b"), built with model=False.
# Synthesis itself is done by calling the shared MODEL directly in generate().
# NOTE(review): judging by the flags in CHOICES, "a" is presumably American
# English and "b" British English — confirm against the Kokoro docs.
PIPELINES = {lang: KPipeline(lang_code=lang, model=False) for lang in "ab"}
# Override the lexicon entry for the word "kokoro" in each pipeline.
# NOTE(review): the values look like phoneme strings — confirm the notation.
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"

# Load every voice listed in CHOICES up front, on the pipeline selected by the
# first character of its ID.
# NOTE(review): presumably this warms a cache so the load_voice() call in
# generate() is cheap — confirm that load_voice caches.
for v in CHOICES.values():
    PIPELINES[v[0]].load_voice(v)

# Single shared model instance, switched to eval mode. It is not moved to
# DEVICE anywhere in the visible part of this file.
MODEL = KModel().eval()
    
# Stylesheet passed to gr.Blocks(css=...): caps the container width at 560px,
# centers <h1> headings and hides the page footer.
css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
    visibility: hidden
}
'''

# Functions
def trim_silence(audio, threshold=SILENT_THRESHOLD):
    """Strip quiet samples from both ends of *audio*.

    A sample is considered audible when its absolute value is strictly
    greater than *threshold*. The returned slice runs from the first audible
    index to the last audible index, inclusive.

    Args:
        audio: Sequence of samples supporting slicing (e.g. a NumPy array).
        threshold: Amplitude above which a sample counts as audible.

    Returns:
        The trimmed slice of *audio*, or *audio* itself, unchanged, when no
        sample exceeds the threshold.
    """
    audible = np.nonzero(np.abs(audio) > threshold)[0]

    # Nothing rises above the threshold: hand the input back untouched.
    if audible.size == 0:
        return audio

    first, last = audible[0], audible[-1]
    return audio[first:last + 1]

def generate(text=DEFAULT_INPUT, voice=DEFAULT_VOICE, speed=1):
    """Synthesize speech for *text* with the given *voice*.

    The input is stripped, cut to CHAR_LIMIT characters and terminated with a
    period before being handed to the pipeline. The pipeline may split the
    text into several segments; every segment is synthesized and the results
    are joined in order, so long inputs are no longer cut off after the first
    segment. Leading and trailing silence of the joined audio is trimmed.

    Args:
        text: Text to speak. ``None`` is treated as an empty string.
        voice: Kokoro voice ID; its first character selects the pipeline in
            PIPELINES.
        speed: Speed value forwarded to both the pipeline and the model.

    Returns:
        A ``(24000, samples)`` tuple for the audio output component, or
        ``None`` when the pipeline yields no segment.

    Raises:
        KeyError: If the first character of *voice* has no pipeline.
    """
    text = (text or "").strip()[:CHAR_LIMIT] + "."
    pipeline = PIPELINES[voice[0]]
    pack = pipeline.load_voice(voice)

    segments = []
    for _, ps, _ in pipeline(text, voice, speed):
        # Select the voice-pack entry that corresponds to this segment's
        # phoneme count.
        ref_s = pack[len(ps) - 1]
        audio = MODEL(ps, ref_s, speed)
        segments.append(audio.numpy())

    if not segments:
        return None

    # Trim once over the whole utterance so pauses between segments survive.
    return (24000, trim_silence(np.concatenate(segments)))

def cloud():
    """Print a fixed log line; wired to the "maintain" button in the UI."""
    message = "[CLOUD] | Space maintained."
    print(message)

@spaces.GPU()
def gpu():
    """Do nothing and return None.

    This function is not called anywhere in the visible part of this file.
    NOTE(review): presumably it exists only so that a `spaces.GPU`-decorated
    function is present in the module — confirm against the `spaces` docs.
    """
    return None

# Initialize
# Build the UI: a header blurb, the input controls and the audio output, each
# in its own column, then wire the buttons to their handlers.
with gr.Blocks(css=css) as main:
    with gr.Column():
        gr.Markdown("🪄 Instantly generate realistic voices using text input.")

    with gr.Column():
        # Named text_input rather than `input` so the builtin input() is not
        # shadowed at module level.
        text_input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
        # Choices are (label, voice ID) pairs; the selected value passed to
        # generate() is the voice ID.
        voice_input = gr.Dropdown(list(CHOICES.items()), value=DEFAULT_VOICE, label="Voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        submit = gr.Button("▶")
        maintain = gr.Button("☁️")

    with gr.Column():
        output = gr.Audio(label="Output")

    # Input order must match generate(text, voice, speed).
    submit.click(fn=generate, inputs=[text_input, voice_input, speed_input], outputs=output)
    # The maintain button only logs a line; it takes no inputs and bypasses the queue.
    maintain.click(cloud, inputs=[], outputs=[], queue=False)

main.launch(show_api=True)