import gradio as gr
import torch
from TTS.api import TTS
import numpy as np
import tempfile
import os

# Model configurations
MODELS = {
    "Hausa": {
        "model_repo": "CLEAR-Global/TWB-Voice-Hausa-TTS-1.0",
        "model_name": "best_model_498283.pth",
        "config_name": "config.json",
        "speakers": {
            "spk_f_1": "Female",
            "spk_m_1": "Male 1",
            "spk_m_2": "Male 2"
        },
        "examples": [
            "Lokacin damuna shuka kan koriya shar.",
            "Lafiyarku tafi kuɗinku muhimmanci.",
            "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
        ]
    }
}

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
loaded_models = {}


def load_model(language):
    """Load TTS model for the specified language"""
    if language not in loaded_models:
        model_repo = MODELS[language]["model_repo"]
        model_name = MODELS[language]["model_name"]
        config_name = MODELS[language]["config_name"]
        try:
            from huggingface_hub import hf_hub_download
            import json

            # First download and read the config to get the required filenames
            config_path = hf_hub_download(repo_id=model_repo, filename=config_name)
            with open(config_path, "r") as f:
                config = json.load(f)

            # Extract filenames from config (just the filename, not the full path)
            speakers_filename = os.path.basename(config.get("speakers_file", "speakers.pth"))
            language_ids_filename = os.path.basename(config.get("language_ids_file", "language_ids.json"))
            d_vector_filename = os.path.basename(config.get("d_vector_file", ["d_vector.pth"])[0])
            config_se_filename = os.path.basename(config.get("model_args", {}).get("speaker_encoder_config_path", "config_se.json"))
            model_se_filename = os.path.basename(config.get("model_args", {}).get("speaker_encoder_model_path", "model_se.pth"))

            # Download the model checkpoint and auxiliary files from the HuggingFace repo
            model_path = hf_hub_download(repo_id=model_repo, filename=model_name)
            speakers_file = hf_hub_download(repo_id=model_repo, filename=speakers_filename)
            language_ids_file = hf_hub_download(repo_id=model_repo, filename=language_ids_filename)
            d_vector_file = hf_hub_download(repo_id=model_repo, filename=d_vector_filename)
            config_se_file = hf_hub_download(repo_id=model_repo, filename=config_se_filename)
            model_se_file = hf_hub_download(repo_id=model_repo, filename=model_se_filename)

            # Update the config paths to point to the downloaded files
            config["speakers_file"] = speakers_file
            config["language_ids_file"] = language_ids_file
            config["d_vector_file"] = [d_vector_file]
            config["model_args"]["speakers_file"] = speakers_file
            config["model_args"]["language_ids_file"] = language_ids_file
            config["model_args"]["d_vector_file"] = [d_vector_file]
            config["model_args"]["speaker_encoder_config_path"] = config_se_file
            config["model_args"]["speaker_encoder_model_path"] = model_se_file

            # Save the updated config to a temporary file
            temp_config = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False)
            json.dump(config, temp_config, indent=2)
            temp_config.close()

            print(f"Loading {language} model with config:")
            print(f"- language_ids_file: {config.get('language_ids_file')}")
            print(f"- use_speaker_embedding: {config.get('use_speaker_embedding')}")
            print(f"- speakers_file: {config.get('speakers_file')}")
            print(f"- d_vector_file: {config.get('d_vector_file')}")

            # Load TTS model with the patched model and config paths
            loaded_models[language] = TTS(
                model_path=model_path,
                config_path=temp_config.name,
                gpu=torch.cuda.is_available(),
            )
        except Exception as e:
            print(f"Error loading {language} model: {e}")
            import traceback
            traceback.print_exc()
            return None
    return loaded_models[language]
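
# A minimal smoke test for load_model, kept commented out so the Space does not
# run it at import time (a hedged sketch: it assumes network access to the
# Hugging Face Hub). The first call caches the model in loaded_models, and the
# synthesizer exposes the output sample rate used when saving audio below:
#
#   tts = load_model("Hausa")
#   if tts is not None:
#       assert loaded_models["Hausa"] is tts  # cached after first load
#       print(tts.synthesizer.output_sample_rate)  # 24000 per the model card
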
def update_speakers(language):
    """Update speaker dropdown based on selected language"""
    if language in MODELS:
        speakers = MODELS[language]["speakers"]
        choices = [(f"{speaker_id}: {description}", speaker_id)
                   for speaker_id, description in speakers.items()]
        return gr.Dropdown(choices=choices, value=choices[0][1], interactive=True)
    return gr.Dropdown(choices=[], interactive=False)


def get_example_text(language, example_idx):
    """Get example text for the selected language"""
    if language in MODELS and 0 <= example_idx < len(MODELS[language]["examples"]):
        return MODELS[language]["examples"][example_idx]
    return ""


def synthesize_speech(text, language, speaker):
    """Synthesize speech from text"""
    if not text.strip():
        return None, "Please enter some text to synthesize."

    # Load the model
    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Failed to load {language} model."

    try:
        # The models expect lowercase input text
        text = text.lower().strip()
        print(f"DEBUG: Processing text: '{text}'")
        print(f"DEBUG: Speaker name: '{speaker}'")

        synthesizer = tts_model.synthesizer
        try:
            wav = synthesizer.tts(text=text, speaker_name=speaker)
        except TypeError:
            # Fall back to synthesis without a speaker name if the
            # synthesizer does not accept the speaker_name argument
            wav = synthesizer.tts(text=text)
        print("DEBUG: synthesizer.tts() completed successfully")

        # Convert to a numpy array and save to a temporary file
        wav_array = np.array(wav, dtype=np.float32)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")

        # Save audio using the synthesizer's sample rate
        import scipy.io.wavfile as wavfile
        wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)

        print("Speech synthesized successfully!")
        return temp_file.name, "Speech synthesized successfully!"
    except Exception as e:
        return None, f"Error during synthesis: {str(e)}"
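
# Example of driving synthesize_speech directly, without the UI (a hedged
# sketch, commented out; the speaker ID comes from MODELS["Hausa"]["speakers"]
# and the sentence is one of the Hausa examples defined above):
#
#   path, status = synthesize_speech(
#       "Lafiyarku tafi kuɗinku muhimmanci.", "Hausa", "spk_f_1"
#   )
#   print(status, path)  # path is a temporary .wav file on success, else None
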
# Create Gradio interface
with gr.Blocks(title="TWB Voice TTS Demo") as demo:
    gr.Markdown("""
    # TWB Voice Text-to-Speech Demo Space

    This demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global.
    It currently supports the **Hausa** and **Kanuri** languages, developed as part of the first phase of the project.

    ### Features:
    - **Hausa**: 3 speakers (1 female, 2 male)
    - **Kanuri**: 1 female speaker
    - High-quality 24kHz audio output
    - Based on the YourTTS architecture

    ### Links:
    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
    - 🌐 [TWB Voice Project](https://twbvoice.org/)

    ---
    """)

    with gr.Row():
        with gr.Column():
            # Language selection
            language_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Hausa",
                label="Language",
                info="Select the language for synthesis"
            )

            # Speaker selection
            speaker_dropdown = gr.Dropdown(
                choices=list(MODELS["Hausa"]["speakers"].keys()),
                value="spk_f_1",
                label="Speaker",
                info="Select the voice speaker"
            )

            # Text input
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text in the selected language (will be converted to lowercase)",
                lines=3,
                info="Note: Text will be automatically converted to lowercase, as required by the models"
            )

            # Example buttons
            gr.Markdown("**Press to load an example sentence in the selected language:**")
            with gr.Row():
                example_btn_1 = gr.Button("Example 1", size="sm")
                example_btn_2 = gr.Button("Example 2", size="sm")
                example_btn_3 = gr.Button("Example 3", size="sm")

            # Synthesize button
            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")

        with gr.Column():
            # Audio output
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

            # Status message
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Event handlers
    language_dropdown.change(
        fn=update_speakers,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown]
    )

    example_btn_1.click(
        fn=lambda lang: get_example_text(lang, 0),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    example_btn_2.click(
        fn=lambda lang: get_example_text(lang, 1),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    example_btn_3.click(
        fn=lambda lang: get_example_text(lang, 2),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, language_dropdown, speaker_dropdown],
        outputs=[audio_output, status_output]
    )

    gr.Markdown("""
    ---
    ### Notes:
    - Models expect **lowercase input text** (converted automatically)
    - Audio output is generated at a 24kHz sample rate

    ### License:
    This app and the models are released under the **CC-BY-NC-4.0** license (non-commercial use only).

    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
    """)


if __name__ == "__main__":
    demo.launch()
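
# To run the demo locally (a hedged note: this assumes the file is saved as
# app.py and that gradio, TTS (Coqui), torch, numpy, scipy, and
# huggingface_hub are installed):
#
#   python app.py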