import warnings
warnings.filterwarnings("ignore")

# External libraries
import re
import requests
import argparse
import json
import os
import tempfile
import librosa
import numpy as np
# import torch
# from torch import no_grad, LongTensor
# import commons
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils

# Internal libraries (only needed for the in-house VITS models, currently disabled)
# from models import SynthesizerTrn
# from text import text_to_sequence, text_to_sequence_for_test, _clean_text
# from mel_processing import spectrogram_torch
# import utils
# from text.symbols import symbols

all_example = "my voice is my passport verify me."

# ElevenLabs prebuilt voices; the two lists are index-aligned.
eleven_voice_id = [
    "21m00Tcm4TlvDq8ikWAM", "29vD33N1CtxCmqQRPOHJ", "2EiwWnXFnvU5JabPnv8n",
    "5Q0t7uMcjvnagumLfvZi", "AZnzlk1XvdvUeBnXmlld", "CYw3kZ02Hs0563khs1Fj",
    "D38z5RcWu1voky8WS1ja", "EXAVITQu4vr4xnSDxMaL", "ErXwobaYiN019PkySvjV",
    "GBv7mTt0atIp3Br8iCZE", "IKne3meq5aSn9XLyUdCD", "JBFqnCBsd6RMkjVDRZzb",
    "LcfcDJNUP1GQjkzn1xUU", "MF3mGyEYCl7XYWbV9V6O", "N2lVS1w4EtoT3dr4eOWO",
    "ODq5zmih8GrVes37Dizd", "SOYHLrjzK2X1ezoPC6cr", "TX3LPaxmHKxFdv7VOQHJ",
    "ThT5KcBeYPX3keUQqHPh", "TxGEqnHWrfWFTfGW9XjX", "VR6AewLTigWG4xSOukaG",
    "XB0fDUnXU5powFXDhCwa", "Xb7hH8MSUJpSbSDYk0k2", "XrExE9yKIg1WjnnlVkGX",
    "ZQe5CZNOzWyzPSCn5a3c", "Zlb1dXrM653N07WRdFW3", "bVMeCyTHy58xNoL34h3p",
    "flq6f7yk4E4fJM5XTYuZ", "g5CIjZEefAph4nQFvHAz", "iP95p4xoKVk53GoZ742B",
    "jBpfuIE2acCO8z3wKNLl", "jsCqWAovK2LkecY7zXl4", "nPczCjzI2devNBz1zQrb",
    "oWAxZDx7w5VEj9dCyTzz", "onwK4e9ZLuTAKqWW03F9", "pFZP5JQG7iQjIQuC4Bku",
    "pMsXgVXv3BLzUgSXRplE", "pNInz6obpgDQGcFmaJgB", "piTKgcLEGmPE4e6mEKli",
    "pqHfZKP75CvOlQylNhV4", "t0jbNlBVZ17f02VDIeMI", "yoZ06aMxZJJ28mfd3POQ",
    "z9fAnlkpzviPz146aGWa", "zcAOhNBS3c14rBihAFp1", "zrHiDhphv9ZnVXBqCLjz",
]

eleven_name = [
    "Rachel", "Drew", "Clyde", "Paul", "Domi",
    "Dave", "Fin", "Sarah", "Antoni", "Thomas",
    "Charlie", "George", "Emily", "Elli", "Callum",
    "Patrick", "Harry", "Liam", "Dorothy", "Josh",
    "Arnold", "Charlotte", "Alice", "Matilda", "James",
    "Joseph", "Jeremy", "Michael", "Ethan", "Chris",
    "Gigi", "Freya", "Brian", "Grace", "Daniel",
    "Lily", "Serena", "Adam", "Nicole", "Bill",
    "Jessie", "Sam", "Glinda", "Giovanni", "Mimi",
]

eleven_id_model_name_dict = dict(zip(eleven_name, eleven_voice_id))  # display name -> voice id


def _response_to_audio_path(response):
    """Persist the MP3 bytes of an API response to a temp file so gr.Audio can play it."""
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(response.content)
        return f.name


def openai(text, name):
    """Synthesize `text` with the OpenAI TTS API using voice `name`; returns (message, mp3_path)."""
    headers = {
        # Read the key from the environment instead of hardcoding it in the source.
        'Authorization': 'Bearer ' + os.environ.get('OPENAI_API_KEY', ''),
        'Content-Type': 'application/json',
    }
    json_data = {
        'model': 'tts-1-hd',
        'input': text,
        'voice': name,
    }
    response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, json=json_data)
    # data = '{\n "model": "tts-1",\n "input": "The quick brown fox jumped over the lazy dog.",\n "voice": "alloy"\n }'
    # response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, data=data)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)
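
# A minimal sketch, not wired into the app below: for long inputs it may be
# preferable to stream the MP3 to disk rather than buffering response.content
# in memory. The endpoint and payload mirror openai() above; the helper name
# and output filename are hypothetical choices for illustration.
def openai_stream_to_file(text, name, out_path="openai_tts.mp3"):
    headers = {
        'Authorization': 'Bearer ' + os.environ.get('OPENAI_API_KEY', ''),
        'Content-Type': 'application/json',
    }
    json_data = {'model': 'tts-1-hd', 'input': text, 'voice': name}
    # stream=True lets requests iterate the body in chunks instead of loading it whole.
    with requests.post('https://api.openai.com/v1/audio/speech',
                       headers=headers, json=json_data, stream=True) as response:
        response.raise_for_status()
        with open(out_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    return out_path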

def elevenlabs(text, name):
    """Synthesize `text` with ElevenLabs; `name` is a voice id (see eleven_id_model_name_dict)."""
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{name}"
    CHUNK_SIZE = 1024
    # url = "https://api.elevenlabs.io/v1/text-to-speech/"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        # Read the key from the environment instead of hardcoding it in the source.
        "xi-api-key": os.environ.get("ELEVENLABS_API_KEY", ""),
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }
    response = requests.post(url, json=data, headers=headers)
    # with open('output.mp3', 'wb') as f:
    #     for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
    #         if chunk:
    #             f.write(chunk)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)


microsoft_model_list = [
    "en-US-AvaMultilingualNeural"
]


def microsoft(text, name, style="Neural"):
    """
    :param text: text to synthesize
    :param name: Azure neural voice name, e.g. "en-US-AvaMultilingualNeural"
    :param style: reserved; not used by the SSML request below
    :return: (message, mp3_path)
    """
    headers = {
        # Read the key from the environment instead of hardcoding it in the source.
        'Ocp-Apim-Subscription-Key': os.environ.get('AZURE_SPEECH_KEY', ''),
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }
    # Request body is SSML, following the standard Azure TTS request shape.
    data = ("<speak version='1.0' xml:lang='en-US'>"
            f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
            f"{text}"
            "</voice>"
            "</speak>")
    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers,
        data=data,
    )
    # data = {
    #     "text": text,
    #     "name": name,
    #     "style": style,
    #     "format": "mp3"}
    # audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url']
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)
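
# A hedged sketch: microsoft() accepts a `style` argument but never uses it.
# Azure supports speaking styles via the <mstts:express-as> SSML element; this
# hypothetical variant shows how the parameter could be wired in. Styles are
# only honored by voices that support them, so treat this as illustrative.
def microsoft_with_style(text, name, style="cheerful"):
    headers = {
        'Ocp-Apim-Subscription-Key': os.environ.get('AZURE_SPEECH_KEY', ''),
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }
    # The mstts namespace is required for express-as.
    data = ("<speak version='1.0' xml:lang='en-US' "
            "xmlns:mstts='https://www.w3.org/2001/mstts'>"
            f"<voice xml:lang='en-US' name='{name}'>"
            f"<mstts:express-as style='{style}'>{text}</mstts:express-as>"
            "</voice>"
            "</speak>")
    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers, data=data)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", _response_to_audio_path(response)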

# def google(text, name):
#     # import subprocess
#     # command1 = subprocess.run('gcloud auth print-access-token', shell=True, capture_output=True, text=True).stdout
#     headers = {
#         'Authorization': 'Bearer ' + "synclub-2383kjhjksxfv.2341gs",
#         'x-goog-user-project': 'PROJECT_ID',
#         'Content-Type': 'application/json; charset=utf-8',
#     }
#     data = {
#         "input": {
#             "text": f"{text}"},
#         "voice": {
#             "languageCode": "en-gb",
#             "name": "en-GB-Standard-A",
#             "ssmlGender": "FEMALE"
#         },
#         "audioConfig": {
#             "audioEncoding": "MP3"
#         }
#     }
#     response = requests.post('https://texttospeech.googleapis.com/v1/text:synthesize', headers=headers, data=data)
#     return "Success", response


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument("--share", action="store_true", default=True, help="share gradio app")
    parser.add_argument("--port", type=int, default=8081, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()

    # Quick manual checks (the helpers return (message, mp3_path)):
    # _, audio_path = microsoft(all_example, 'en-US-AvaMultilingualNeural')
    # print(audio_path)
    # _, audio_path = elevenlabs(all_example, "21m00Tcm4TlvDq8ikWAM")
    # print(audio_path)

    # device = torch.device(args.device)
    # models_tts = []
    # with open(args.model_info_path, "r", encoding="utf-8") as f:
    #     models_info = json.load(f)
    # for i, info in models_info.items():
    #     model_name = info["model_name"]
    #     author = info["author"]
    #     lang = info["lang"]
    #     example = info["example"]
    #     config_path = info["config_path"]
    #     model_path = info["model_path"]
    #     model_type = info["model_type"]
    #     hps = utils.get_hparams_from_file(config_path)
    #     if model_type == "vits":
    #         emotion_type = None
    #     elif model_type == "vits-emotion":
    #         emotion_type = "embedding"
    #     elif model_type == "vits-emotion-logits":
    #         emotion_type = "logits"
    #     model = SynthesizerTrn(
    #         len(symbols),
    #         hps.data.filter_length // 2 + 1,
    #         hps.train.segment_size // hps.data.hop_length,
    #         emotion_type=emotion_type,
    #         **hps.model)
    #     utils.load_checkpoint(model_path, model, None)
    #     model.eval().to(device)
    #     if model_type == "vits":
    #         # Plain TTS
    #         models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))

    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():
            # with gr.TabItem("In-house"):
            #     with gr.Tabs():
            #         for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
            #             with gr.TabItem(model_name):
            #                 with gr.Column():
            #                     tts_input1 = gr.TextArea(label="Text", value=example)
            #                     tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
            #                     tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
            #                     tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
            #                                            minimum=0.0, maximum=2, step=0.1)
            #                     tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
            #                     tts_submit = gr.Button("Generate", variant="primary")
            #                     tts_output1 = gr.Textbox(label="Output Message")
            #                     tts_output2 = gr.Audio(label="Output Audio")
            #                     tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
            #                                      [tts_output1, tts_output2])
            # with gr.TabItem("Google"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(google_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(google, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])
            with gr.TabItem("Microsoft"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])
            # with gr.TabItem("coefont"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(coefont_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(coefont, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])
    app.launch(show_api=False, share=args.share, server_name='0.0.0.0', server_port=args.port,
               show_error=True)
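
# Usage sketch (the script filename is assumed here; the flags are defined above):
#   python tts_demo.py --port 8081
# Note that --share defaults to True, so the app also requests a public Gradio
# share link; locally it serves at http://0.0.0.0:8081 with a "Microsoft" tab.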