import warnings
warnings.filterwarnings("ignore")

# External libraries
import re
import requests
import argparse
import json
import os
import tempfile
import librosa
import numpy as np
# import torch
# from torch import no_grad, LongTensor
# import commons
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils

# Internal libraries (only needed for the in-house VITS models, currently disabled)
# from models import SynthesizerTrn
# from text import text_to_sequence, text_to_sequence_for_test, _clean_text
# from mel_processing import spectrogram_torch
# import utils
# from text.symbols import symbols

all_example = "my voice is my passport verify me."

# ElevenLabs prebuilt voices; the two lists are index-aligned.
eleven_voice_id = [
    "21m00Tcm4TlvDq8ikWAM", "29vD33N1CtxCmqQRPOHJ", "2EiwWnXFnvU5JabPnv8n",
    "5Q0t7uMcjvnagumLfvZi", "AZnzlk1XvdvUeBnXmlld", "CYw3kZ02Hs0563khs1Fj",
    "D38z5RcWu1voky8WS1ja", "EXAVITQu4vr4xnSDxMaL", "ErXwobaYiN019PkySvjV",
    "GBv7mTt0atIp3Br8iCZE", "IKne3meq5aSn9XLyUdCD", "JBFqnCBsd6RMkjVDRZzb",
    "LcfcDJNUP1GQjkzn1xUU", "MF3mGyEYCl7XYWbV9V6O", "N2lVS1w4EtoT3dr4eOWO",
    "ODq5zmih8GrVes37Dizd", "SOYHLrjzK2X1ezoPC6cr", "TX3LPaxmHKxFdv7VOQHJ",
    "ThT5KcBeYPX3keUQqHPh", "TxGEqnHWrfWFTfGW9XjX", "VR6AewLTigWG4xSOukaG",
    "XB0fDUnXU5powFXDhCwa", "Xb7hH8MSUJpSbSDYk0k2", "XrExE9yKIg1WjnnlVkGX",
    "ZQe5CZNOzWyzPSCn5a3c", "Zlb1dXrM653N07WRdFW3", "bVMeCyTHy58xNoL34h3p",
    "flq6f7yk4E4fJM5XTYuZ", "g5CIjZEefAph4nQFvHAz", "iP95p4xoKVk53GoZ742B",
    "jBpfuIE2acCO8z3wKNLl", "jsCqWAovK2LkecY7zXl4", "nPczCjzI2devNBz1zQrb",
    "oWAxZDx7w5VEj9dCyTzz", "onwK4e9ZLuTAKqWW03F9", "pFZP5JQG7iQjIQuC4Bku",
    "pMsXgVXv3BLzUgSXRplE", "pNInz6obpgDQGcFmaJgB", "piTKgcLEGmPE4e6mEKli",
    "pqHfZKP75CvOlQylNhV4", "t0jbNlBVZ17f02VDIeMI", "yoZ06aMxZJJ28mfd3POQ",
    "z9fAnlkpzviPz146aGWa", "zcAOhNBS3c14rBihAFp1", "zrHiDhphv9ZnVXBqCLjz",
]

eleven_name = [
    "Rachel", "Drew", "Clyde", "Paul", "Domi",
    "Dave", "Fin", "Sarah", "Antoni", "Thomas",
    "Charlie", "George", "Emily", "Elli", "Callum",
    "Patrick", "Harry", "Liam", "Dorothy", "Josh",
    "Arnold", "Charlotte", "Alice", "Matilda", "James",
    "Joseph", "Jeremy", "Michael", "Ethan", "Chris",
    "Gigi", "Freya", "Brian", "Grace", "Daniel",
    "Lily", "Serena", "Adam", "Nicole", "Bill",
    "Jessie", "Sam", "Glinda", "Giovanni", "Mimi",
]

eleven_id_model_name_dict = dict(zip(eleven_name, eleven_voice_id))  # display name -> voice id


def _response_to_audio_path(response):
    """Persist the MP3 bytes of an API response to a temp file so gr.Audio can play it."""
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(response.content)
        return f.name


def openai(text, name):
    """Synthesize `text` with the OpenAI TTS API using voice `name`; returns (message, mp3_path)."""
    headers = {
        # Read the key from the environment instead of hardcoding it in the source.
        'Authorization': 'Bearer ' + os.environ.get('OPENAI_API_KEY', ''),
        'Content-Type': 'application/json',
    }
    json_data = {
        'model': 'tts-1-hd',
        'input': text,
        'voice': name,
    }
    response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, json=json_data)
    # data = '{\n "model": "tts-1",\n "input": "The quick brown fox jumped over the lazy dog.",\n "voice": "alloy"\n }'
    # response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, data=data)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)
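
# A minimal sketch, not wired into the app below: for long inputs it may be
# preferable to stream the MP3 to disk rather than buffering response.content
# in memory. The endpoint and payload mirror openai() above; the helper name
# and output filename are hypothetical choices for illustration.
def openai_stream_to_file(text, name, out_path="openai_tts.mp3"):
    headers = {
        'Authorization': 'Bearer ' + os.environ.get('OPENAI_API_KEY', ''),
        'Content-Type': 'application/json',
    }
    json_data = {'model': 'tts-1-hd', 'input': text, 'voice': name}
    # stream=True lets requests iterate the body in chunks instead of loading it whole.
    with requests.post('https://api.openai.com/v1/audio/speech',
                       headers=headers, json=json_data, stream=True) as response:
        response.raise_for_status()
        with open(out_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    return out_path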

def elevenlabs(text, name):
    """Synthesize `text` with ElevenLabs; `name` is a voice id (see eleven_id_model_name_dict)."""
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{name}"
    CHUNK_SIZE = 1024
    # url = "https://api.elevenlabs.io/v1/text-to-speech/"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        # Read the key from the environment instead of hardcoding it in the source.
        "xi-api-key": os.environ.get("ELEVENLABS_API_KEY", ""),
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }
    response = requests.post(url, json=data, headers=headers)
    # with open('output.mp3', 'wb') as f:
    #     for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
    #         if chunk:
    #             f.write(chunk)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)


microsoft_model_list = [
    "en-US-AvaMultilingualNeural"
]


def microsoft(text, name, style="Neural"):
    """
    :param text: text to synthesize
    :param name: Azure neural voice name, e.g. "en-US-AvaMultilingualNeural"
    :param style: reserved; not used by the SSML request below
    :return: (message, mp3_path)
    """
    headers = {
        # Read the key from the environment instead of hardcoding it in the source.
        'Ocp-Apim-Subscription-Key': os.environ.get('AZURE_SPEECH_KEY', ''),
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }
    # Request body is SSML, following the standard Azure TTS request shape.
    data = ("<speak version='1.0' xml:lang='en-US'>"
            f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
            f"{text}"
            "</voice>"
            "</speak>")
    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers,
        data=data,
    )
    # data = {
    #     "text": text,
    #     "name": name,
    #     "style": style,
    #     "format": "mp3"}
    # audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url']
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)
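
# A hedged sketch: microsoft() accepts a `style` argument but never uses it.
# Azure supports speaking styles via the <mstts:express-as> SSML element; this
# hypothetical variant shows how the parameter could be wired in. Styles are
# only honored by voices that support them, so treat this as illustrative.
def microsoft_with_style(text, name, style="cheerful"):
    headers = {
        'Ocp-Apim-Subscription-Key': os.environ.get('AZURE_SPEECH_KEY', ''),
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }
    # The mstts namespace is required for express-as.
    data = ("<speak version='1.0' xml:lang='en-US' "
            "xmlns:mstts='https://www.w3.org/2001/mstts'>"
            f"<voice xml:lang='en-US' name='{name}'>"
            f"<mstts:express-as style='{style}'>{text}</mstts:express-as>"
            "</voice>"
            "</speak>")
    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers, data=data)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)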

# def google(text, name):
#     # import subprocess
#     # command1 = subprocess.run('gcloud auth print-access-token', shell=True, capture_output=True, text=True).stdout
#     headers = {
#         'Authorization': 'Bearer ' + "synclub-2383kjhjksxfv.2341gs",
#         'x-goog-user-project': 'PROJECT_ID',
#         'Content-Type': 'application/json; charset=utf-8',
#     }
#     data = {
#         "input": {
#             "text": f"{text}"},
#         "voice": {
#             "languageCode": "en-gb",
#             "name": "en-GB-Standard-A",
#             "ssmlGender": "FEMALE"
#         },
#         "audioConfig": {
#             "audioEncoding": "MP3"
#         }
#     }
#     response = requests.post('https://texttospeech.googleapis.com/v1/text:synthesize', headers=headers, data=data)
#     return "Success", response


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument("--share", action="store_true", default=True, help="share gradio app")
    parser.add_argument("--port", type=int, default=8081, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()

    # Quick manual checks (the helpers return (message, mp3_path)):
    # _, audio_path = microsoft(all_example, 'en-US-AvaMultilingualNeural')
    # print(audio_path)
    # _, audio_path = elevenlabs(all_example, "21m00Tcm4TlvDq8ikWAM")
    # print(audio_path)

    # device = torch.device(args.device)
    # models_tts = []
    # with open(args.model_info_path, "r", encoding="utf-8") as f:
    #     models_info = json.load(f)
    # for i, info in models_info.items():
    #     model_name = info["model_name"]
    #     author = info["author"]
    #     lang = info["lang"]
    #     example = info["example"]
    #     config_path = info["config_path"]
    #     model_path = info["model_path"]
    #     model_type = info["model_type"]
    #     hps = utils.get_hparams_from_file(config_path)
    #     if model_type == "vits":
    #         emotion_type = None
    #     elif model_type == "vits-emotion":
    #         emotion_type = "embedding"
    #     elif model_type == "vits-emotion-logits":
    #         emotion_type = "logits"
    #     model = SynthesizerTrn(
    #         len(symbols),
    #         hps.data.filter_length // 2 + 1,
    #         hps.train.segment_size // hps.data.hop_length,
    #         emotion_type=emotion_type,
    #         **hps.model)
    #     utils.load_checkpoint(model_path, model, None)
    #     model.eval().to(device)
    #     if model_type == "vits":
    #         # Plain TTS
    #         models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))

    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():
            # with gr.TabItem("In-house"):
            #     with gr.Tabs():
            #         for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
            #             with gr.TabItem(model_name):
            #                 with gr.Column():
            #                     tts_input1 = gr.TextArea(label="Text", value=example)
            #                     tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
            #                     tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
            #                     tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
            #                                            minimum=0.0, maximum=2, step=0.1)
            #                     tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
            #                     tts_submit = gr.Button("Generate", variant="primary")
            #                     tts_output1 = gr.Textbox(label="Output Message")
            #                     tts_output2 = gr.Audio(label="Output Audio")
            #                     tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
            #                                      [tts_output1, tts_output2])
            # with gr.TabItem("Google"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(google_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(google, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])
            with gr.TabItem("Microsoft"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])
            # with gr.TabItem("coefont"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(coefont_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(coefont, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])
    app.launch(show_api=False, share=args.share, server_name='0.0.0.0', server_port=args.port,
               show_error=True)
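
# Usage sketch (the script filename is assumed here; the flags are defined above):
#   python tts_demo.py --port 8081
# Note that --share defaults to True, so the app also requests a public Gradio
# share link; locally it serves at http://0.0.0.0:8081 with a "Microsoft" tab.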