import warnings
warnings.filterwarnings("ignore")
# External libraries
import requests
import argparse
import os
import tempfile
# Unused in the active demo; kept alongside the commented-out model code below
# import re
# import json
# import librosa
# import numpy as np
# import torch
# from torch import no_grad, LongTensor
# import commons
import gradio as gr
# import gradio.utils as gr_utils
# import gradio.processing_utils as gr_processing_utils
# Internal libraries
# from models import SynthesizerTrn
# from text import text_to_sequence, text_to_sequence_for_test, _clean_text
# from mel_processing import spectrogram_torch
# import utils
# from text.symbols import symbols
all_example = "My voice is my passport. Verify me."
eleven_voice_id = [
    "21m00Tcm4TlvDq8ikWAM",
    "29vD33N1CtxCmqQRPOHJ",
    "2EiwWnXFnvU5JabPnv8n",
    "5Q0t7uMcjvnagumLfvZi",
    "AZnzlk1XvdvUeBnXmlld",
    "CYw3kZ02Hs0563khs1Fj",
    "D38z5RcWu1voky8WS1ja",
    "EXAVITQu4vr4xnSDxMaL",
    "ErXwobaYiN019PkySvjV",
    "GBv7mTt0atIp3Br8iCZE",
    "IKne3meq5aSn9XLyUdCD",
    "JBFqnCBsd6RMkjVDRZzb",
    "LcfcDJNUP1GQjkzn1xUU",
    "MF3mGyEYCl7XYWbV9V6O",
    "N2lVS1w4EtoT3dr4eOWO",
    "ODq5zmih8GrVes37Dizd",
    "SOYHLrjzK2X1ezoPC6cr",
    "TX3LPaxmHKxFdv7VOQHJ",
    "ThT5KcBeYPX3keUQqHPh",
    "TxGEqnHWrfWFTfGW9XjX",
    "VR6AewLTigWG4xSOukaG",
    "XB0fDUnXU5powFXDhCwa",
    "Xb7hH8MSUJpSbSDYk0k2",
    "XrExE9yKIg1WjnnlVkGX",
    "ZQe5CZNOzWyzPSCn5a3c",
    "Zlb1dXrM653N07WRdFW3",
    "bVMeCyTHy58xNoL34h3p",
    "flq6f7yk4E4fJM5XTYuZ",
    "g5CIjZEefAph4nQFvHAz",
    "iP95p4xoKVk53GoZ742B",
    "jBpfuIE2acCO8z3wKNLl",
    "jsCqWAovK2LkecY7zXl4",
    "nPczCjzI2devNBz1zQrb",
    "oWAxZDx7w5VEj9dCyTzz",
    "onwK4e9ZLuTAKqWW03F9",
    "pFZP5JQG7iQjIQuC4Bku",
    "pMsXgVXv3BLzUgSXRplE",
    "pNInz6obpgDQGcFmaJgB",
    "piTKgcLEGmPE4e6mEKli",
    "pqHfZKP75CvOlQylNhV4",
    "t0jbNlBVZ17f02VDIeMI",
    "yoZ06aMxZJJ28mfd3POQ",
    "z9fAnlkpzviPz146aGWa",
    "zcAOhNBS3c14rBihAFp1",
    "zrHiDhphv9ZnVXBqCLjz",
]
eleven_name = [
    "Rachel",
    "Drew",
    "Clyde",
    "Paul",
    "Domi",
    "Dave",
    "Fin",
    "Sarah",
    "Antoni",
    "Thomas",
    "Charlie",
    "George",
    "Emily",
    "Elli",
    "Callum",
    "Patrick",
    "Harry",
    "Liam",
    "Dorothy",
    "Josh",
    "Arnold",
    "Charlotte",
    "Alice",
    "Matilda",
    "James",
    "Joseph",
    "Jeremy",
    "Michael",
    "Ethan",
    "Chris",
    "Gigi",
    "Freya",
    "Brian",
    "Grace",
    "Daniel",
    "Lily",
    "Serena",
    "Adam",
    "Nicole",
    "Bill",
    "Jessie",
    "Sam",
    "Glinda",
    "Giovanni",
    "Mimi",
]
# Maps display name -> ElevenLabs voice ID (despite the id-first variable name)
eleven_id_model_name_dict = dict(zip(eleven_name, eleven_voice_id))
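# All three vendor APIs below return raw MP3 bytes in the HTTP response body,
# while gr.Audio plays from a file path. This helper (a sketch added here, not
# part of any vendor SDK) writes the bytes to a temporary file for Gradio.
def save_response_audio(response):
    """Write MP3 bytes from a successful response to a temp file; return its path.

    Returns None when the request failed, so callers can surface the error.
    """
    if response.status_code != 200:
        return None
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(response.content)
        return f.name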
def openai(text, name):
    # OpenAI text-to-speech endpoint. The API key is read from the standard
    # OPENAI_API_KEY environment variable rather than being hardcoded.
    headers = {
        'Authorization': 'Bearer ' + os.environ.get('OPENAI_API_KEY', ''),
        'Content-Type': 'application/json',
    }
    json_data = {
        'model': 'tts-1-hd',
        'input': text,
        'voice': name,
    }
    response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, json=json_data)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", save_response_audio(response)
def elevenlabs(text, name):
    # ElevenLabs text-to-speech endpoint; `name` is a voice ID from
    # eleven_voice_id above. The ELEVEN_API_KEY environment-variable name is an
    # assumption; adjust it to match your deployment.
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{name}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": os.environ.get("ELEVEN_API_KEY", ""),
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
        },
    }
    response = requests.post(url, json=data, headers=headers)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", save_response_audio(response)
microsoft_model_list = [
    "en-US-AvaMultilingualNeural"
]
def microsoft(text, name, style="Neural"):
    """Synthesize `text` with the Azure Cognitive Services TTS REST API.

    :param text: text to synthesize (interpolated into the SSML payload verbatim)
    :param name: Azure voice name, e.g. "en-US-AvaMultilingualNeural"
    :param style: reserved; not currently applied to the SSML payload
    :return: (status message, path to an MP3 file or None)
    """
    # The subscription key is read from the environment (the AZURE_SPEECH_KEY
    # variable name is an assumption); the endpoint is pinned to japaneast.
    headers = {
        'Ocp-Apim-Subscription-Key': os.environ.get('AZURE_SPEECH_KEY', ''),
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }
    data = ("<speak version='1.0' xml:lang='en-US'>"
            f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
            f"{text}"
            "</voice>"
            "</speak>")
    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers,
        data=data,
    )
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    return "Success", save_response_audio(response)
# def google(text, name):
#     # An OAuth access token is required, e.g. from:
#     #   gcloud auth print-access-token
#     headers = {
#         'Authorization': 'Bearer ' + os.environ.get('GCP_ACCESS_TOKEN', ''),
#         'x-goog-user-project': 'PROJECT_ID',
#         'Content-Type': 'application/json; charset=utf-8',
#     }
#     data = {
#         "input": {
#             "text": text},
#         "voice": {
#             "languageCode": "en-gb",
#             "name": "en-GB-Standard-A",
#             "ssmlGender": "FEMALE"
#         },
#         "audioConfig": {
#             "audioEncoding": "MP3"
#         }
#     }
#     # json= (not data=) so the dict is serialized as JSON
#     response = requests.post('https://texttospeech.googleapis.com/v1/text:synthesize', headers=headers, json=data)
#     return "Success", response
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    # `store_true` with default=True made --share a no-op; BooleanOptionalAction
    # keeps the old default while making --share/--no-share actually work.
    parser.add_argument("--share", action=argparse.BooleanOptionalAction, default=True, help="share gradio app")
    parser.add_argument("--port", type=int, default=8081, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()
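    # Typical invocation (hypothetical script name):
    #   python tts_demo.py --port 8081 --no-share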
    # app = gr.Blocks()
    # with app:
    #     gr.Markdown("## TTS Demo")
    #     with gr.Tabs():
    #         with gr.TabItem("Microsoft"):
    #             tts_input1 = gr.TextArea(label="Text", value=all_example)
    #             tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
    #             tts_submit = gr.Button("Generate", variant="primary")
    #             tts_output1 = gr.Textbox(label="Output Message")
    #             tts_output2 = gr.Audio(label="Output Audio")
    #             tts_submit.click(microsoft, [tts_input1, tts_input2],
    #                              [tts_output1, tts_output2])
    # _, audio = microsoft(all_example, 'en-US-AvaMultilingualNeural')
    # _, audio = google(all_example, 'alloy')
    # print(audio)
    # with open("test4.mp3", "wb") as f:
    #     f.write(audio.content)
    # _, audio = elevenlabs(all_example, "21m00Tcm4TlvDq8ikWAM")
    # print(audio)
    # with open('output.mp3', 'wb') as f:
    #     for chunk in audio.iter_content(chunk_size=1024):
    #         if chunk:
    #             f.write(chunk)
    # device = torch.device(args.device)
    # models_tts = []
    # with open(args.model_info_path, "r", encoding="utf-8") as f:
    #     models_info = json.load(f)
    # for i, info in models_info.items():
    #     model_name = info["model_name"]
    #     author = info["author"]
    #     lang = info["lang"]
    #     example = info["example"]
    #     config_path = info["config_path"]
    #     model_path = info["model_path"]
    #     model_type = info["model_type"]
    #     hps = utils.get_hparams_from_file(config_path)
    #     if model_type == "vits":
    #         emotion_type = None
    #     elif model_type == "vits-emotion":
    #         emotion_type = "embedding"
    #     elif model_type == "vits-emotion-logits":
    #         emotion_type = "logits"
    #     model = SynthesizerTrn(
    #         len(symbols),
    #         hps.data.filter_length // 2 + 1,
    #         hps.train.segment_size // hps.data.hop_length,
    #         emotion_type=emotion_type,
    #         **hps.model)
    #     utils.load_checkpoint(model_path, model, None)
    #     model.eval().to(device)
    #     if model_type == "vits":
    #         # Plain TTS
    #         models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))
    app = gr.Blocks()
    with app:
        gr.Markdown("## TTS Demo")
        with gr.Tabs():
            # with gr.TabItem("In-house"):
            #     with gr.Tabs():
            #         for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
            #             with gr.TabItem(model_name):
            #                 with gr.Column():
            #                     tts_input1 = gr.TextArea(label="Text", value=example)
            #                     tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
            #                     tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
            #                     tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
            #                                            minimum=0.0, maximum=2, step=0.1)
            #                     tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
            #                     tts_submit = gr.Button("Generate", variant="primary")
            #                     tts_output1 = gr.Textbox(label="Output Message")
            #                     tts_output2 = gr.Audio(label="Output Audio")
            #                     tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
            #                                      [tts_output1, tts_output2])
            # with gr.TabItem("Google"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(google_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(google, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])
            with gr.TabItem("Microsoft"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])
            # with gr.TabItem("coefont"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(coefont_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(coefont, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])
    app.launch(show_api=False,
               share=args.share,
               server_name='0.0.0.0',
               server_port=args.port,
               show_error=True)