import warnings
warnings.filterwarnings("ignore")
# External libraries
import re
import requests
import argparse
import json
import os
import tempfile
import librosa
import numpy as np
# import torch
# from torch import no_grad, LongTensor
# import commons
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils
# Internal libraries
# from models import SynthesizerTrn
# from text import text_to_sequence, text_to_sequence_for_test, _clean_text
# from mel_processing import spectrogram_torch
# import utils
# from text.symbols import symbols
all_example = "My voice is my passport. Verify me."
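# ElevenLabs premade voice IDs; paired one-to-one with the names in eleven_name below.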
eleven_voice_id = [
"21m00Tcm4TlvDq8ikWAM",
"29vD33N1CtxCmqQRPOHJ",
"2EiwWnXFnvU5JabPnv8n",
"5Q0t7uMcjvnagumLfvZi",
"AZnzlk1XvdvUeBnXmlld",
"CYw3kZ02Hs0563khs1Fj",
"D38z5RcWu1voky8WS1ja",
"EXAVITQu4vr4xnSDxMaL",
"ErXwobaYiN019PkySvjV",
"GBv7mTt0atIp3Br8iCZE",
"IKne3meq5aSn9XLyUdCD",
"JBFqnCBsd6RMkjVDRZzb",
"LcfcDJNUP1GQjkzn1xUU",
"MF3mGyEYCl7XYWbV9V6O",
"N2lVS1w4EtoT3dr4eOWO",
"ODq5zmih8GrVes37Dizd",
"SOYHLrjzK2X1ezoPC6cr",
"TX3LPaxmHKxFdv7VOQHJ",
"ThT5KcBeYPX3keUQqHPh",
"TxGEqnHWrfWFTfGW9XjX",
"VR6AewLTigWG4xSOukaG",
"XB0fDUnXU5powFXDhCwa",
"Xb7hH8MSUJpSbSDYk0k2",
"XrExE9yKIg1WjnnlVkGX",
"ZQe5CZNOzWyzPSCn5a3c",
"Zlb1dXrM653N07WRdFW3",
"bVMeCyTHy58xNoL34h3p",
"flq6f7yk4E4fJM5XTYuZ",
"g5CIjZEefAph4nQFvHAz",
"iP95p4xoKVk53GoZ742B",
"jBpfuIE2acCO8z3wKNLl",
"jsCqWAovK2LkecY7zXl4",
"nPczCjzI2devNBz1zQrb",
"oWAxZDx7w5VEj9dCyTzz",
"onwK4e9ZLuTAKqWW03F9",
"pFZP5JQG7iQjIQuC4Bku",
"pMsXgVXv3BLzUgSXRplE",
"pNInz6obpgDQGcFmaJgB",
"piTKgcLEGmPE4e6mEKli",
"pqHfZKP75CvOlQylNhV4",
"t0jbNlBVZ17f02VDIeMI",
"yoZ06aMxZJJ28mfd3POQ",
"z9fAnlkpzviPz146aGWa",
"zcAOhNBS3c14rBihAFp1",
"zrHiDhphv9ZnVXBqCLjz",
]
eleven_name = [
"Rachel",
"Drew",
"Clyde",
"Paul",
"Domi",
"Dave",
"Fin",
"Sarah",
"Antoni",
"Thomas",
"Charlie",
"George",
"Emily",
"Elli",
"Callum",
"Patrick",
"Harry",
"Liam",
"Dorothy",
"Josh",
"Arnold",
"Charlotte",
"Alice",
"Matilda",
"James",
"Joseph",
"Jeremy",
"Michael",
"Ethan",
"Chris",
"Gigi",
"Freya",
"Brian",
"Grace",
"Daniel",
"Lily",
"Serena",
"Adam",
"Nicole",
"Bill",
"Jessie",
"Sam",
"Glinda",
"Giovanni",
"Mimi",
]
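# Map each human-readable voice name to its ElevenLabs voice ID.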
eleven_id_model_name_dict = dict(zip(eleven_name, eleven_voice_id))
def openai(text, name):
    """Synthesize `text` with the OpenAI TTS API using voice `name`; return (status, mp3 path)."""
    headers = {
        # Read the API key from the environment instead of hardcoding a secret.
        'Authorization': 'Bearer ' + os.environ.get('OPENAI_API_KEY', ''),
        'Content-Type': 'application/json',
    }
    json_data = {
        'model': 'tts-1-hd',
        'input': text,
        'voice': name,
    }
    response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, json=json_data)
    # Note: json_data will not be serialized by requests
    # exactly as it was in the original request.
    # data = '{\n  "model": "tts-1",\n  "input": "The quick brown fox jumped over the lazy dog.",\n  "voice": "alloy"\n}'
    # response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, data=data)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    # Persist the mp3 bytes so callers (e.g. gr.Audio) can play the file from a path.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(response.content)
    return "Success", f.name
def elevenlabs(text, name):
    """Synthesize `text` with the ElevenLabs API using voice ID `name`; return (status, mp3 path)."""
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{name}"
    # url = "https://api.elevenlabs.io/v1/text-to-speech/<voice-id>"
    CHUNK_SIZE = 1024
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        # Read the API key from the environment instead of hardcoding a secret.
        "xi-api-key": os.environ.get("ELEVENLABS_API_KEY", ""),
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }
    response = requests.post(url, json=data, headers=headers)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}", None
    # Stream the mp3 to a temp file so gr.Audio can play it from a path.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)
    return "Success", f.name
microsoft_model_list = [
"en-US-AvaMultilingualNeural"
]
def microsoft(text, name, style="Neural"):
"""
:param text:
:param name:
:param style:
:return:
"""
headers = {
'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628',
'Content-Type': 'application/ssml+xml',
'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
'User-Agent': 'curl',
}
data = ("<speak version='1.0' xml:lang='en-US'>"
f"<voice xml:lang='en-US' name='{name}'>" # xml:gender='Female'
f"{text}"
"</voice>"
"</speak>")
response = requests.post(
'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
headers=headers,
data=data,
)
# data = {
# "text":text,
# "name":name,
# "style":style,
# "format":"mp3"}
# audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url']
return "Success", response
# def google(text,name):
# # import subprocess
# # command1 = subprocess.run('gcloud auth print-access-token', shell=True, capture_output=True, text=True).stdout
# headers = {
# 'Authorization': 'Bearer ' + "synclub-2383kjhjksxfv.2341gs",
# 'x-goog-user-project': 'PROJECT_ID',
# 'Content-Type': 'application/json; charset=utf-8',
# }
# data = {
# "input": {
# "text": f"{text}"},
# "voice": {
# "languageCode": "en-gb",
# "name": "en-GB-Standard-A",
# "ssmlGender": "FEMALE"
# },
# "audioConfig": {
# "audioEncoding": "MP3"
# }
# }
# response = requests.post('https://texttospeech.googleapis.com/v1/text:synthesize', headers=headers, json=data)
# return "Success", response
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cuda')
parser.add_argument("--share", action="store_true", default=True, help="share gradio app")
parser.add_argument("--port", type=int, default=8081, help="port")
parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
args = parser.parse_args()
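    # Example (sketch) launch, assuming this file is saved as test.py:
    #   python test.py --port 8081 --share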
# app = gr.Blocks()
# with app:
# gr.Markdown("## Japanese TTS Demo")
# with gr.Tabs():
# with gr.TabItem("微软"):
# tts_input1 = gr.TextArea(label="Text", value=all_example)
# tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
# tts_submit = gr.Button("Generate", variant="primary")
# tts_output1 = gr.Textbox(label="Output Message")
# tts_output2 = gr.Audio(label="Output Audio")
# tts_submit.click(microsoft, [tts_input1, tts_input2],
# [tts_output1, tts_output2])
# _, audio = microsoft(all_example, 'en-US-AvaMultilingualNeural')
# _, audio = google(all_example,'alloy')
# print(audio)
# with open("test4.mp3", "wb") as f:
# f.write(audio.content)
#_, audio = elevenlabs(all_example, "21m00Tcm4TlvDq8ikWAM")
# print(audio)
# with open('output.mp3', 'wb') as f:
# for chunk in audio.iter_content(chunk_size=1024):
# if chunk:
# f.write(chunk)
# device = torch.device(args.device)
# models_tts = []
# with open(args.model_info_path, "r", encoding="utf-8") as f:
# models_info = json.load(f)
# for i, info in models_info.items():
# model_name = info["model_name"]
# author = info["author"]
# lang = info["lang"]
# example = info["example"]
# config_path = info["config_path"]
# model_path = info["model_path"]
# model_type = info["model_type"]
# hps = utils.get_hparams_from_file(config_path)
# if model_type == "vits":
# emotion_type = None
# elif model_type == "vits-emotion":
# emotion_type = "embedding"
# elif model_type == "vits-emotion-logits":
# emotion_type = "logits"
# model = SynthesizerTrn(
# len(symbols),
# hps.data.filter_length // 2 + 1,
# hps.train.segment_size // hps.data.hop_length,
# emotion_type=emotion_type,
# **hps.model)
# utils.load_checkpoint(model_path, model, None)
# model.eval().to(device)
# if model_type == "vits":
# # Plain TTS
# models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))
app = gr.Blocks()
with app:
gr.Markdown("## Japanese TTS Demo")
with gr.Tabs():
# with gr.TabItem("自研"):
# with gr.Tabs():
# for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
# with gr.TabItem(model_name):
# with gr.Column():
# tts_input1 = gr.TextArea(label="Text", value=example)
# tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
# tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
# tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
# minimum=0.0, maximum=2, step=0.1)
# tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
# tts_submit = gr.Button("Generate", variant="primary")
# tts_output1 = gr.Textbox(label="Output Message")
# tts_output2 = gr.Audio(label="Output Audio")
# tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
# [tts_output1, tts_output2])
# with gr.TabItem("谷歌"):
# tts_input1 = gr.TextArea(label="Text", value=all_example)
# tts_input2 = gr.Dropdown(google_model_list, label="name")
# tts_submit = gr.Button("Generate", variant="primary")
# tts_output1 = gr.Textbox(label="Output Message")
# tts_output2 = gr.Audio(label="Output Audio")
# tts_submit.click(google, [tts_input1, tts_input2],
# [tts_output1, tts_output2])
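            # The Azure (Microsoft) backend is the only one wired up in this demo.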
with gr.TabItem("微软"):
tts_input1 = gr.TextArea(label="Text", value=all_example)
tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
tts_submit = gr.Button("Generate", variant="primary")
tts_output1 = gr.Textbox(label="Output Message")
tts_output2 = gr.Audio(label="Output Audio")
tts_submit.click(microsoft, [tts_input1, tts_input2],
[tts_output1, tts_output2])
# with gr.TabItem("coefont"):
# tts_input1 = gr.TextArea(label="Text", value=all_example)
# tts_input2 = gr.Dropdown(coefont_model_list, label="name")
# tts_submit = gr.Button("Generate", variant="primary")
# tts_output1 = gr.Textbox(label="Output Message")
# tts_output2 = gr.Audio(label="Output Audio")
# tts_submit.click(coefont, [tts_input1, tts_input2],
# [tts_output1, tts_output2])
app.launch(show_api=False,
share=args.share,
server_name='0.0.0.0',
server_port=args.port,
show_error=True)