Upload folder using huggingface_hub
- 1.wav +0 -0
- 11LabsTTS.py +168 -0
- README.md +2 -8
- api.json +12 -0
- ensemble_app.py +439 -0
- gradio_demo.py +13 -0
- guge_api.py +32 -0
- install.sh +17 -0
- microsoftTTS.py +31 -0
- output.mp3 +0 -0
- test.mp3 +0 -0
- test.py +357 -0
- test2.mp3 +1 -0
- test3.mp3 +0 -0
- test4.mp3 +17 -0
- test9.mp3 +17 -0
- test99.mp3 +0 -0
- try.py +33 -0
- try2.py +0 -0
- tts_gradio.py +255 -0
1.wav
ADDED
Binary file (46.1 kB)
11LabsTTS.py
ADDED
@@ -0,0 +1,168 @@
"""
import requests
import gradio as gr

CHUNK_SIZE = 1024

headers1 = {
    "Accept": "application/json",
    "xi-api-key": "54f884da3108725f26af02d5907d1eb4"
}

headers2 = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": "54f884da3108725f26af02d5907d1eb4"
}

name_list = []

def elevenlabs_tts(text, voice_name):
    # fetch the voice list
    url1 = "https://api.elevenlabs.io/v1/voices"
    response1 = requests.get(url1, headers=headers1)
    voices = response1.json()['voices']

    for voice in voices:
        vid = voice['voice_id']
        vname = voice['name']
        label = voice['labels']
        info = {"voice_id": vid, "name": vname, "labels": label}

        name_list.append(info["name"])

        if info['name'] == voice_name:
            voice_id = info['voice_id']
            url2 = "https://api.elevenlabs.io/v1/text-to-speech/" + voice_id
            # print(infos)
            # return infos

    # synthesize speech for the selected voice
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    response2 = requests.post(url2, json=data, headers=headers2)
    with open('output.mp3', 'wb') as f:
        for chunk in response2.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)

    return 'output.mp3'

demo = gr.Interface(
    fn = elevenlabs_tts,

    # demo input settings
    inputs = [
        gr.Dropdown(name_list, label="发音人"),
        gr.Textbox(label="输入文本"),
    ],
    # demo output settings
    outputs = [
        "audio",
        "text",
    ],
    # other demo settings
    title = "Text to Speech Synthesis",
    description = "Synthesize speech from text using Elevenlabs",
    examples = [
        ["Rachel", "Hello World!"],
        ["Clyde", "This is a test."],
        ["Domi", "Gradio is awesome!"],
    ]
)

if __name__ == "__main__":
    demo.launch(share=True, server_name='0.0.0.0', server_port=8121)
    #print(name_list)

"""

import requests
import gradio as gr

CHUNK_SIZE = 1024

headers1 = {
    "Accept": "application/json",
    "xi-api-key": "54f884da3108725f26af02d5907d1eb4"
}

headers2 = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": "54f884da3108725f26af02d5907d1eb4"
}

def get_voice_names():
    url1 = "https://api.elevenlabs.io/v1/voices"
    response1 = requests.get(url1, headers=headers1)
    voices = response1.json()['voices']
    names = [voice['name'] for voice in voices]
    return names

name_list = get_voice_names()

def elevenlabs_tts(voice_name, text):
    # fetch the voice list
    url1 = "https://api.elevenlabs.io/v1/voices"
    response1 = requests.get(url1, headers=headers1)
    voices = response1.json()['voices']
    #print(voice_name)
    #print(voices)

    for voice in voices:
        if voice['name'] == voice_name:
            voice_id = voice['voice_id']
            label = voice['labels']
            url2 = "https://api.elevenlabs.io/v1/text-to-speech/" + voice_id
            #print(voice_id)
            #print(label)
            break

    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    response2 = requests.post(url2, json=data, headers=headers2)
    with open('output.mp3', 'wb') as f:
        for chunk in response2.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)

    return 'output.mp3', label

demo = gr.Interface(
    fn = elevenlabs_tts,
    inputs = [
        gr.Dropdown(name_list, label="发音人"),
        gr.Textbox(label="输入文本"),
    ],
    outputs = [
        "audio",
        "text",
    ],
    title = "Text to Speech Synthesis",
    description = "Synthesize speech from text using Elevenlabs",
    examples = [
        ["Rachel", "Hello World!"],
        ["Clyde", "This is a test."],
        ["Domi", "Gradio is awesome!"],
    ]
)

if __name__ == "__main__":
    demo.launch(share=True, server_name='0.0.0.0', server_port=8121)
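Note: the xi-api-key above is committed in plain text. A minimal alternative (a sketch, not part of this commit; ELEVEN_API_KEY is an assumed environment variable) keeps the key out of the repo:

import os

# assumption: the key is exported as ELEVEN_API_KEY before launching the app
XI_API_KEY = os.environ["ELEVEN_API_KEY"]

headers1 = {"Accept": "application/json", "xi-api-key": XI_API_KEY}
headers2 = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": XI_API_KEY,
}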
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: blue
-colorTo: purple
+title: gradio
+app_file: gradio_demo.py
 sdk: gradio
 sdk_version: 4.36.1
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api.json
ADDED
@@ -0,0 +1,12 @@
{
  "type": "service_account",
  "project_id": "gravity-362603",
  "private_key_id": "aa2ae4651df33c1aab0670e07e265992f943c292",
  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQC9VMCci15hlqhR\nv8ev+K1HNq5c309RsE0yxhRec1QsB1RYShIU0vkukbOXn5pqF0lRHasvtjpAuEVX\nXsUWrParLUdJJBsttAD9WwLBfek1cVfvTktHLyttM92V1R6ueWI0rHa0TQpVc/m0\nDjhuwCDDShRxMOdkmJ98ZT7Rpl9udw3O5tEyYkqkN8AFAtCwKsglxUFgEgHg60qr\nyAtdHm+K/wi4d1xlzvsmsr11227PFz5PYVAdpNMO5/JOopxTUp19KD8xo/pW1p1f\neDsJZc8dvwcwxei0qQ9FiFXm9rhSVkccMjsJf9PSOIRj9Q0q3Pe5HcaKB8kGRxBJ\nYQY+dM0hAgMBAAECggEAIskUjm6Iy7NLf6GVxb4R0Bi61nNs9lxHAsIj8eqMl35N\nP990DQFqKOU6MuxvCXj3zoB9+Ekp0MRKcTAHVBA5TeAGzehlWfNNjz8OdZB1rY0C\nlAQ1joGbH/g8iQ/cEEBbC8R4Fs5kzKh/Q1athaOlIVxV/yn2CaSFAOz1Kr2/oOLL\ntujyykgL69nkT4Namojh69GNIztLEQBjnrjSLXBJXYRov0FEVzA0DuI9ubV+upmS\nHEzgpHsJQCDVD/OryPH1KVN3Y1DCTdfSU3SUzOTiPv80v/H0GsZvfi/8M7rvNwgs\nZ4t+5Vs3zhOQXujT66dL7rzum2zyRcez1c6qU707tQKBgQDtC8e/au+jCKpjszuQ\nrMBhXl5BGowD5ODvLa1bB3r0V6C6uyryzXOTSnWcBiWqbv53BwQk+4LLYO/8g4MV\n3zD3EZK0L9l+UIOJbX2aHNf3fxU05zeuuREa35M93MgG+J5jAjhV3piWik8RcGtq\nQpbdNVVsVfATucGA9tH+r9YF4wKBgQDMeEUGsofjzH+PGkYWv6sxwtevBJAVqaVF\nVirT614ngGEgb1ct0lg7jRmiMTLrYwDNc3Wpel8jkfH7WNpDcrZjOv14qeVomdQa\n/3XUMdlxOxcFkJ14J6Jd4W3o/vQvPFlbUO/qH93WtotyLQW3pcQKGc0LK33UaLw1\nQFlSeJ/wKwKBgFgq4YjZBXjmaDndGHKfTo7owrYEYb+xCdjsbGZHSfxH+4h4xWi5\n8hr+vu/heXEDB7LN8USwJ93tmFWbtM4VzeiXqLFMeuD4oXQkPWts/wcXRZP/zD44\np5wPPHzt+ZheMMsysDGAMdEMcIxT/B/x1JtCkxtQe5NarBaDt2e7jydxAoGAMKh+\nZLWfuQo1YOyzuT4rk0/22+OOFlmkxvdWgdAFPAlFE97rNDBWZvWRIBPtNi79Recw\nHQDOGzzkBRiD+IEX/k1PfKORwG67FyGr/K9QO64AMIbVsn5cGVNIQKZLneISsbR1\nI5YpyKrwTF+CeM2t9Wlmbj4PV7kE8Pc7SnECPrkCgYBjzbcl+vTvfw6iy6z7iL+N\nQ+sgsUMoUymZF5xSH/iInzFDlrIZP+vJlC6eW/0Q2TZZqQU9eLUTTP9cB4XXEv45\nbvK1IlMSBeVHA3tSbpp/tNqb/YqDTyEV9AY2MMIAKyPReTUKZoRQMyOzzK03FYOk\nMbI0XZMpHoo60OECydP7Ng==\n-----END PRIVATE KEY-----\n",
  "client_email": "[email protected]",
  "client_id": "112160328750047841519",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gcp-tts-gravity%40gravity-362603.iam.gserviceaccount.com"
}
ensemble_app.py
ADDED
@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding=utf8 -*-
########################################################################
#
# Copyright (c) 2023 Baidu.com, Inc. All Rights Reserved
#
########################################################################

"""
Author: linxiaolong
"""
import warnings
warnings.filterwarnings("ignore")

# third-party libraries
import re
import requests
import argparse
import json
import os
import re
import tempfile

import librosa
import numpy as np
import torch  # restored: torch.device below requires it
from torch import no_grad, LongTensor  # restored: used by get_text and tts_fn
import commons
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils

# internal libraries
from models import SynthesizerTrn
from text import text_to_sequence, text_to_sequence_for_test, _clean_text
from mel_processing import spectrogram_torch
import utils
from text.symbols import symbols

limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces
punct_regex = re.compile(r"[\.!\?。!?]")
silence_duration = 200


def split_text(text, regex):
    """Split text into sentences at punctuation marks.

    Args:
        text: long text.
        regex: punctuation regex.

    Returns:
        list of sentences.
    """
    sentences = re.split(regex, text)
    puncts = re.findall(regex, text)

    for i, sentence in enumerate(sentences):
        if sentence == "":
            continue
        if i < len(puncts):
            sentences[i] = sentences[i] + puncts[i]
        else:
            sentences[i] = sentences[i] + "。"
    sentences = [i for i in sentences if i != ""]
    return sentences


def concat_audio(audio_list, sampling_rate=22050, silence_duration=1000):
    """Concatenate audio clips and insert silence between them.

    Args:
        audio_list: list of audio clips.
        sampling_rate: audio sampling rate. Defaults to 22050.
        silence_duration: silence duration in milliseconds. Defaults to 1000.

    Returns:
        concatenated audio.
    """
    silence_samples = int(sampling_rate * silence_duration / 1000)
    silence = np.zeros(silence_samples, dtype=np.float16)

    audio_num = len(audio_list)
    if audio_num < 2:
        return audio_list[0]
    audio_cat = audio_list[0]
    for i in range(1, audio_num):
        audio_cat = np.concatenate((audio_cat, silence, audio_list[i]), axis=0)

    return audio_cat


### hyperparameters for the external TTS services
microsoft_url = "https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1"
microsoft_headers = {'Content-Type': 'application/json; charset=utf-8',
                     'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628'}
microsoft_model_list = [
    "ja-JP-NanamiNeural",
    "ja-JP-KeitaNeural",
    "ja-JP-AoiNeural",
    "ja-JP-DaichiNeural",
    "ja-JP-MayuNeural",
    "ja-JP-NaokiNeural",
    "ja-JP-ShioriNeural"
]

google_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/voice_gq"
google_headers = {'Content-Type': 'application/json; charset=utf-8',
                  'apikey': 'synclub-2383kjhjksxfv.2341gs'}
google_model_list = [
    "ja-JP-Neural2-B",
    "ja-JP-Neural2-C",
    "ja-JP-Neural2-D",
    "ja-JP-Standard-A",
    "ja-JP-Standard-B",
    "ja-JP-Standard-C",
    "ja-JP-Standard-D",
    "ja-JP-Wavenet-A",
    "ja-JP-Wavenet-B",
    "ja-JP-Wavenet-C",
    "ja-JP-Wavenet-D"
]

coefont_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/avatar_coe"
coefont_headers = {'Content-Type': 'application/json; charset=utf-8',
                   'apikey': 'synclub-2383kjhjksxfv.2341gs'}
coefont_id = [
    '3f84b7b1-30fb-4677-a704-fd136515303e',
    '9b826785-bea5-4740-b4cd-e9a286264705',
    '7632cba3-4aca-4cee-9d15-ad1ac31f670c',
    '2c91238a-96f9-4cb6-a69a-461ee66b0e6d',
    '08428dee-65b6-490e-a3a3-60dfcdda889d',
    'c88367bc-5954-426b-a1ba-a683202803c8',
    'fb64a764-91d5-4510-bddd-70df3d62709a',
    '5cfa1f33-bca8-4489-bcbe-701045993162',
    '94cf7792-7c0c-4be4-88e7-c30d26ab6616',
    '81dbd387-6ad6-4b22-93f9-4e2a0091b2fe',
    '931a8568-039a-4cef-add7-bee71629c00e',
    'f91a9d29-c8b4-443f-ba07-82e7e36bd20b',
    '23c76cf0-bee0-47fa-b735-9b7bdba9f26a',
    'cf5fdfb8-85ea-41e1-915b-257936791f17',
    '0f7b53df-3c24-46a5-84d1-cbea39a956c0',
    '3d499385-d331-4cbb-93c0-2057e60eddcf',
    '18ca2f7b-97ca-486d-8f47-858965833642',
    '33e0a2ff-5050-434c-9506-defe97e52f15',
    '516b0f32-8b5f-48c5-b60e-38d508e2b06b',
    'c8720caf-2d2d-4130-8831-92f61f9e25e8',
    '710001f5-e6f5-4cc0-8ba2-e6aa6da8d807',
    'd36f8bb1-8bd8-4e90-964a-9dbd3e374093',
    '2157796c-fe48-4688-b7cc-7ea554edf77d',
    '5cc0dc91-0c6a-4c50-b7d8-f3117cfe44ef',
    'be5c5295-aba2-4055-a9da-8926da7fb5a0',
    '76763239-af14-4c0d-9435-956f096f77dc',
    '10d298ee-ebbf-4838-a6c5-d608f2e3c338',
    '694cb06e-73bd-43c4-94d4-f775ad3dbb26',
    '5cf07e7c-5b1c-4360-a8de-7c928580d4b5',
    '76e2ba06-b23a-4bbe-8148-e30ede9001b9',
    'c25ed97f-78f7-4e8f-b2fa-f8e29633588b',
    'e26382ba-2ae2-4cf7-8c1b-420ab4b845d8',
    '82c4fcf5-d0ee-4fe9-9b0d-89a65d04f290'
]
coefont_model_list = [
    'Canel',
    '胡麻ちゃん',
    'バーチャル悪霊',
    '引寄\u3000法則',
    'にっし~☆',
    '志水 智(Tomo Shimizu)',
    '花撫シア-最高精度-しっかり読み上げ',
    'UNF/UserNotFound',
    'RoBaKu',
    'おにもち',
    '小菅 将太',
    '秋月つむぎ(落ち着いたナレーション)',
    '碧海紘斗_OhmiHiroto',
    'ちくわぶえ',
    'unnamed',
    '今井瑶子(高精度。MC ナレーター 落ち着いたトーンです)',
    '皆のお母さん',
    '後藤邑子',
    '田中和彦',
    'KTNR',
    '天渡\u3000早苗',
    '須戸ゼロ',
    'とり藻々',
    '武田 祐子',
    '【PRO】落ち着きナレーション♯畑耕平',
    '音暖ののん Ver2.0(最高精度)',
    'ろさちゃん-soft-v2[最高精度] ¦ Losa-chan -soft- ∀ -汎用式概念χ',
    'パイナップル秀夫お姉さん',
    'minamo',
    'あさのゆき',
    '聲華 琴音【紡】',
    '黄琴海月【うるとら】',
    '高橋 俊輔']
coefont_id_model_name_dict = dict(zip(coefont_model_list, coefont_id))

all_example = "今日は天気がいいから、一緒にハイキングに行きましょう。"

# def audio_postprocess(self, y):
#     """
#     patched version of gradio's audio postprocess hook
#     :param self:
#     :param y:
#     :return:
#     """
#     if y is None:
#         return None

#     if gr_utils.validate_url(y):
#         file = gr_processing_utils.download_to_file(y, dir=self.temp_dir)
#     elif isinstance(y, tuple):
#         sample_rate, data = y
#         file = tempfile.NamedTemporaryFile(
#             suffix=".wav", dir=self.temp_dir, delete=False
#         )
#         gr_processing_utils.audio_to_file(sample_rate, data, file.name)
#     else:
#         file = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir)

#     return gr_processing_utils.encode_url_or_file_to_base64(file.name)

# gr.Audio.postprocess = audio_postprocess

def get_text(text, hps):
    """
    :param text:
    :param hps:
    :return:
    """
    # hps does not include symbols
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    # hps includes symbols
    # text_norm = text_to_sequence_for_test(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm


def create_tts_fn(model, hps):
    """
    :param model:
    :param hps:
    :return:
    """
    def tts_fn(text, speed, noise_scale=.667, noise_scale_w=0.8, volume=1.0):
        """
        :param text:
        :param speed:
        :param noise_scale:
        :param noise_scale_w:
        :param volume:
        :return:
        """
        sentences = split_text(text, punct_regex)
        audio_list = []
        for sentence in sentences:
            stn_tst = get_text(sentence, hps)
            with no_grad():
                x_tst = stn_tst.unsqueeze(0).to(device)
                x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
                audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                                    length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
            audio_list.append(audio)
            del stn_tst, x_tst, x_tst_lengths
        audio = concat_audio(audio_list, hps.data.sampling_rate, silence_duration)
        audio = audio * volume
        return "Success", (hps.data.sampling_rate, audio)
    return tts_fn


def microsoft(text, name, style="Neural"):
    """
    :param text:
    :param name:
    :param style:
    :return:
    """
    headers = {
        'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628',
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }

    data = ("<speak version='1.0' xml:lang='en-US'>"
            f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
            f"{text}"
            "</voice>"
            "</speak>")

    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers,
        data=data,
        proxies={
            'http': 'http://192.168.3.11:80',
            'https': 'http://192.168.3.11:80',
        }
    )
    data = {
        "text": text,
        "name": name,
        "style": style,
        "format": "mp3"}
    audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url']
    return "Success", audio_url


def google(text, name):
    """
    :param text:
    :param name:
    :return:
    """
    data = {
        "text": text,
        "name": name,
        "sample_rate": 16000}
    audio_url = requests.get(google_url, headers=google_headers, json=data).json()['data']['url']
    return "Success", audio_url


def coefont(text, name):
    """
    :param text:
    :param name:
    :return:
    """
    data = {
        "text": text,
        "coefont": coefont_id_model_name_dict[name]
    }
    audio_url = requests.get(coefont_url, headers=coefont_headers, json=data).json()['data']['url']
    return "Success", audio_url


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    parser.add_argument("--port", type=int, default=8080, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()

    device = torch.device(args.device)
    models_tts = []

    with open(args.model_info_path, "r", encoding="utf-8") as f:
        models_info = json.load(f)
    for i, info in models_info.items():
        model_name = info["model_name"]
        author = info["author"]
        lang = info["lang"]
        example = info["example"]
        config_path = info["config_path"]
        model_path = info["model_path"]
        model_type = info["model_type"]

        hps = utils.get_hparams_from_file(config_path)
        if model_type == "vits":
            emotion_type = None
        elif model_type == "vits-emotion":
            emotion_type = "embedding"
        elif model_type == "vits-emotion-logits":
            emotion_type = "logits"

        model = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            emotion_type=emotion_type,
            **hps.model)

        utils.load_checkpoint(model_path, model, None)
        model.eval().to(device)
        if model_type == "vits":
            # plain TTS
            models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))
    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():
            with gr.TabItem("自研"):
                with gr.Tabs():
                    for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
                        with gr.TabItem(model_name):
                            with gr.Column():
                                tts_input1 = gr.TextArea(label="Text", value=example)
                                tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
                                tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
                                tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
                                                       minimum=0.0, maximum=2, step=0.1)
                                tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
                                tts_submit = gr.Button("Generate", variant="primary")
                                tts_output1 = gr.Textbox(label="Output Message")
                                tts_output2 = gr.Audio(label="Output Audio")
                                tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
                                                 [tts_output1, tts_output2])

            with gr.TabItem("谷歌"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(google_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(google, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

            with gr.TabItem("微软"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

            with gr.TabItem("coefont"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(coefont_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(coefont, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

    app.queue(concurrency_count=5).launch(show_api=False,
                                          share=args.share,
                                          server_name='0.0.0.0',
                                          server_port=args.port,
                                          show_error=True)
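For reference, split_text and concat_audio above compose like this (a standalone sketch; it assumes both functions are importable from this module and needs only numpy):

import re
import numpy as np

punct_regex = re.compile(r"[\.!\?。!?]")

# sentence splitting keeps the trailing punctuation on each piece
sentences = split_text("今日は晴れ。散歩しましょう!", punct_regex)
# -> ["今日は晴れ。", "散歩しましょう!"]

# two fake 1-second clips at 22.05 kHz, joined with 200 ms of silence
clips = [np.zeros(22050, dtype=np.float32), np.ones(22050, dtype=np.float32)]
audio = concat_audio(clips, sampling_rate=22050, silence_duration=200)
assert len(audio) == 22050 + int(22050 * 0.2) + 22050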
gradio_demo.py
ADDED
@@ -0,0 +1,13 @@
import gradio as gr

def greet(name):
    return "Hello " + name + "!"

with gr.Blocks() as demo:
    # input component
    name = gr.Textbox(label="Name")
    # output component
    output = gr.Textbox(label="Output Box")
    # button
    greet_btn = gr.Button("Greet")
    # wire up the click event; layout defaults to vertical
    greet_btn.click(fn=greet, inputs=name, outputs=output)
demo.launch(share=True)
guge_api.py
ADDED
@@ -0,0 +1,32 @@
import requests
import base64
import json
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./api.json"

url = "https://texttospeech.googleapis.com/v1/text:synthesize"
headers = {'Content-Type': 'application/json; charset=utf-8',
           'Authorization': 'Bearer ya29.c.b0Aaekm1Iv0qkoIurI3LgzvNRqWE9tZRNusy9HCLT_xcpTXvdNsvYQt_9PUIXDQAvHV-b9abi-_n9Totai8fgJ7qkJ5sJszhU8bKTkP9zPJccfTkU4pCvyJxCVO1APIj7p3Hdvl9LWAIGb6aqFdz0a2dSn2mzsGKSsv0UqjlPr4M4s08HWkW8fqdE8UO98xphUTm0bEYmYFn-feWAPG5QToNi62c_4KZxveCCPhZZOtWLSPCGOj-D9z1YIHZZcdWz8mWQ5lO2WpASEroak4ohRchjSEpnJka8F3uwoY36Oms4M32d9uKYbsdNMFfS1CudzvDYXKSBkH337KZqb6o-zhFUm13Ivxlr1Zbtvv6dw0kbg_my1-p69v9tbXIsgxvgYfjMfhxgxfOX3aJxsgnU-jw0d1w5jgs70InjbZRagcrRth24Sm4wzyma40j5UodiIvI0sRxQOBUzaZ93YXZje1S_vtoxcSUl9VSrQY9rWt2uk2Ili-aIsI6rUa06McJcnSB-jQhq61z3xpoqRXVd04SMZlc1YnWxhZVYakZoRIivMwFkjRvb0SIn7x_1M880QF4IYzMOb7eVkJW2t41u5y2rWSc7rerjetWd2vRehn63vJRp0jn3lskyM5YZqqiWs4VYjqdVt7f9jy3qR2Vgn5V2hvSdrjdbkSgc1UB3fl4k-4sbjrd-M3OpO99R0I921-nMc8lBgIFum9qZ-VcWJJ-ecYZtygSXiiR6ljX6p70h6m90vVbc8bwnk7ez99mUOzhkuR8B8wb1aJd8XyzdYuqwWfMyq7U89dUju-m6dji-XeMuSSwSiueaVhsU68FXiJa_-ieZra1Saak7zjq_feg8hjZRujbh0e_YqRO4Bhf7-5F-UrO1n7XwJIzofR_uxJfgxs6MuFxfUJX87azO1I31Zty5ZSc2Q4mynq0mlQWOvhl7fr8pSJMzfQtOJW6wOc0RrW3ouIB-mWRrie46gbqF0FzSylkipZ3JasoIeO-gS_olF-YfYR5i'
}
text = "二月の下旬に差し掛かる頃だった。"

data = {
    "input": {
        "text": text
    },
    "voice": {
        "languageCode": "ja-JP",
        "name": "ja-JP-Neural2-C",
        "ssmlGender": "MALE"
    },
    "audioConfig": {
        "audioEncoding": "MP3"
    }
}
response = requests.post(url, headers=headers, json=data)
print(response)
# response = response.json()
audio = response.content
#audio = base64.b64decode(audio)
# The response's audio_content is binary.
with open("test9.mp3", "wb") as out:
    out.write(audio)
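The hardcoded "ya29..." value is a short-lived OAuth2 access token and will have expired, which is what the 401 bodies saved as test4.mp3 and test9.mp3 below show. A sketch of minting a fresh one from the bundled api.json, assuming the google-auth package is installed (this script itself never imports it):

import requests
from google.oauth2 import service_account
from google.auth.transport.requests import Request

creds = service_account.Credentials.from_service_account_file(
    "./api.json",
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
creds.refresh(Request())  # fetches a fresh access token into creds.token

headers = {
    "Content-Type": "application/json; charset=utf-8",
    "Authorization": f"Bearer {creds.token}",
}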
install.sh
ADDED
@@ -0,0 +1,17 @@
#!/bin/bash

URL=https://dl.google.com/dl/cloudsdk/channels/rapid/install_google_cloud_sdk.bash

function download {
  scratch="$(mktemp -d -t tmp.XXXXXXXXXX)" || exit
  script_file="$scratch/install_google_cloud_sdk.bash"

  echo "Downloading Google Cloud SDK install script: $URL"
  curl -# "$URL" > "$script_file" || exit
  chmod 775 "$script_file"

  echo "Running install script from: $script_file"
  "$script_file" "$@"
}

download "$@"
microsoftTTS.py
ADDED
@@ -0,0 +1,31 @@
import os
import azure.cognitiveservices.speech as speechsdk

speech_key = "1f1ef0ce53b84261be94fab81df7e628"
service_region = "japaneast"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
#speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

# The neural multilingual voice can speak different languages based on the input text.
speech_config.speech_synthesis_voice_name = 'en-US-AvaMultilingualNeural'

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

# Get text from the console and synthesize to the default speaker.
print("Enter some text that you want to speak >")
text = input()

speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()

if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized for text [{}]".format(text))
elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = speech_synthesis_result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        if cancellation_details.error_details:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Did you set the speech resource key and region values?")
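On a headless host (such as a Space) there is no default speaker, so synthesis can target a file instead; filename= is an AudioOutputConfig parameter in the Azure Speech SDK. A sketch reusing the env-var approach the commented lines above hint at:

import os
import azure.cognitiveservices.speech as speechsdk

# assumption: SPEECH_KEY / SPEECH_REGION are exported, as in the commented-out config above
speech_config = speechsdk.SpeechConfig(subscription=os.environ['SPEECH_KEY'],
                                       region=os.environ['SPEECH_REGION'])
speech_config.speech_synthesis_voice_name = 'en-US-AvaMultilingualNeural'
audio_config = speechsdk.audio.AudioOutputConfig(filename="output.wav")  # write to a file, not a speaker
synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
synthesizer.speak_text_async("Hello from a headless host.").get()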
output.mp3
ADDED
Binary file (36.4 kB)
test.mp3
ADDED
Binary file (35.7 kB)
test.py
ADDED
@@ -0,0 +1,357 @@
import warnings
warnings.filterwarnings("ignore")

# third-party libraries
import re
import requests
import argparse
import json
import os
import re
import tempfile
import librosa
import numpy as np
# import torch
# from torch import no_grad, LongTensor
# import commons
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils

# internal libraries
# from models import SynthesizerTrn
# from text import text_to_sequence, text_to_sequence_for_test, _clean_text
# from mel_processing import spectrogram_torch
# import utils
# from text.symbols import symbols
all_example = "my voice is my passport verify me."

eleven_voice_id = [
    "21m00Tcm4TlvDq8ikWAM",
    "29vD33N1CtxCmqQRPOHJ",
    "2EiwWnXFnvU5JabPnv8n",
    "5Q0t7uMcjvnagumLfvZi",
    "AZnzlk1XvdvUeBnXmlld",
    "CYw3kZ02Hs0563khs1Fj",
    "D38z5RcWu1voky8WS1ja",
    "EXAVITQu4vr4xnSDxMaL",
    "ErXwobaYiN019PkySvjV",
    "GBv7mTt0atIp3Br8iCZE",
    "IKne3meq5aSn9XLyUdCD",
    "JBFqnCBsd6RMkjVDRZzb",
    "LcfcDJNUP1GQjkzn1xUU",
    "MF3mGyEYCl7XYWbV9V6O",
    "N2lVS1w4EtoT3dr4eOWO",
    "ODq5zmih8GrVes37Dizd",
    "SOYHLrjzK2X1ezoPC6cr",
    "TX3LPaxmHKxFdv7VOQHJ",
    "ThT5KcBeYPX3keUQqHPh",
    "TxGEqnHWrfWFTfGW9XjX",
    "VR6AewLTigWG4xSOukaG",
    "XB0fDUnXU5powFXDhCwa",
    "Xb7hH8MSUJpSbSDYk0k2",
    "XrExE9yKIg1WjnnlVkGX",
    "ZQe5CZNOzWyzPSCn5a3c",
    "Zlb1dXrM653N07WRdFW3",
    "bVMeCyTHy58xNoL34h3p",
    "flq6f7yk4E4fJM5XTYuZ",
    "g5CIjZEefAph4nQFvHAz",
    "iP95p4xoKVk53GoZ742B",
    "jBpfuIE2acCO8z3wKNLl",
    "jsCqWAovK2LkecY7zXl4",
    "nPczCjzI2devNBz1zQrb",
    "oWAxZDx7w5VEj9dCyTzz",
    "onwK4e9ZLuTAKqWW03F9",
    "pFZP5JQG7iQjIQuC4Bku",
    "pMsXgVXv3BLzUgSXRplE",
    "pNInz6obpgDQGcFmaJgB",
    "piTKgcLEGmPE4e6mEKli",
    "pqHfZKP75CvOlQylNhV4",
    "t0jbNlBVZ17f02VDIeMI",
    "yoZ06aMxZJJ28mfd3POQ",
    "z9fAnlkpzviPz146aGWa",
    "zcAOhNBS3c14rBihAFp1",
    "zrHiDhphv9ZnVXBqCLjz",
]

eleven_name = [
    "Rachel",
    "Drew",
    "Clyde",
    "Paul",
    "Domi",
    "Dave",
    "Fin",
    "Sarah",
    "Antoni",
    "Thomas",
    "Charlie",
    "George",
    "Emily",
    "Elli",
    "Callum",
    "Patrick",
    "Harry",
    "Liam",
    "Dorothy",
    "Josh",
    "Arnold",
    "Charlotte",
    "Alice",
    "Matilda",
    "James",
    "Joseph",
    "Jeremy",
    "Michael",
    "Ethan",
    "Chris",
    "Gigi",
    "Freya",
    "Brian",
    "Grace",
    "Daniel",
    "Lily",
    "Serena",
    "Adam",
    "Nicole",
    "Bill",
    "Jessie",
    "Sam",
    "Glinda",
    "Giovanni",
    "Mimi",
]
eleven_id_model_name_dict = dict(zip(eleven_name, eleven_voice_id))

def openai(text, name):

    headers = {
        'Authorization': 'Bearer ' + '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',
        'Content-Type': 'application/json',
    }

    json_data = {
        'model': 'tts-1-hd',
        'input': f'{text}',
        'voice': f'{name}',
    }

    response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, json=json_data)

    # Note: json_data will not be serialized by requests
    # exactly as it was in the original request.
    #data = '{\n "model": "tts-1",\n "input": "The quick brown fox jumped over the lazy dog.",\n "voice": "alloy"\n }'
    #response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, data=data)

    return "Success", response

def elevenlabs(text, name):
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{name}"
    CHUNK_SIZE = 1024
    #url = "https://api.elevenlabs.io/v1/text-to-speech/<voice-id>"

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": "a3391f0e3ff8472b61978dbb70ccc6fe"
    }

    data = {
        "text": f"{text}",
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    response = requests.post(url, json=data, headers=headers)
    # with open('output.mp3', 'wb') as f:
    #     for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
    #         if chunk:
    #             f.write(chunk)
    return "Success", response

microsoft_model_list = [
    "en-US-AvaMultilingualNeural"
]

def microsoft(text, name, style="Neural"):
    """
    :param text:
    :param name:
    :param style:
    :return:
    """
    headers = {
        'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628',
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }

    data = ("<speak version='1.0' xml:lang='en-US'>"
            f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
            f"{text}"
            "</voice>"
            "</speak>")

    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers,
        data=data,
    )
    # data = {
    #     "text":text,
    #     "name":name,
    #     "style":style,
    #     "format":"mp3"}
    # audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url']
    return "Success", response

# def google(text,name):
#     # import subprocess
#     # command1 = subprocess.run('gcloud auth print-access-token', shell=True, capture_output=True, text=True).stdout

#     headers = {
#         'Authorization': 'Bearer ' + "synclub-2383kjhjksxfv.2341gs",
#         'x-goog-user-project': 'PROJECT_ID',
#         'Content-Type': 'application/json; charset=utf-8',
#     }

#     data = {
#         "input": {
#             "text": f"{text}"},
#         "voice": {
#             "languageCode": "en-gb",
#             "name": "en-GB-Standard-A",
#             "ssmlGender": "FEMALE"
#         },
#         "audioConfig": {
#             "audioEncoding": "MP3"
#         }
#     }

#     response = requests.post('https://texttospeech.googleapis.com/v1/text:synthesize', headers=headers, data=data)
#     return "Success", response
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument("--share", action="store_true", default=True, help="share gradio app")
    parser.add_argument("--port", type=int, default=8081, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()
    # app = gr.Blocks()
    # with app:
    #     gr.Markdown("## Japanese TTS Demo")
    #     with gr.Tabs():
    #         with gr.TabItem("微软"):
    #             tts_input1 = gr.TextArea(label="Text", value=all_example)
    #             tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
    #             tts_submit = gr.Button("Generate", variant="primary")
    #             tts_output1 = gr.Textbox(label="Output Message")
    #             tts_output2 = gr.Audio(label="Output Audio")
    #             tts_submit.click(microsoft, [tts_input1, tts_input2],
    #                              [tts_output1, tts_output2])
    # _, audio = microsoft(all_example, 'en-US-AvaMultilingualNeural')
    # _, audio = google(all_example,'alloy')
    # print(audio)
    # with open("test4.mp3", "wb") as f:
    #     f.write(audio.content)
    #_, audio = elevenlabs(all_example, "21m00Tcm4TlvDq8ikWAM")
    # print(audio)
    # with open('output.mp3', 'wb') as f:
    #     for chunk in audio.iter_content(chunk_size=1024):
    #         if chunk:
    #             f.write(chunk)

    # device = torch.device(args.device)
    # models_tts = []

    # with open(args.model_info_path, "r", encoding="utf-8") as f:
    #     models_info = json.load(f)
    # for i, info in models_info.items():
    #     model_name = info["model_name"]
    #     author = info["author"]
    #     lang = info["lang"]
    #     example = info["example"]
    #     config_path = info["config_path"]
    #     model_path = info["model_path"]
    #     model_type = info["model_type"]

    #     hps = utils.get_hparams_from_file(config_path)
    #     if model_type == "vits":
    #         emotion_type = None
    #     elif model_type == "vits-emotion":
    #         emotion_type = "embedding"
    #     elif model_type == "vits-emotion-logits":
    #         emotion_type = "logits"

    #     model = SynthesizerTrn(
    #         len(symbols),
    #         hps.data.filter_length // 2 + 1,
    #         hps.train.segment_size // hps.data.hop_length,
    #         emotion_type=emotion_type,
    #         **hps.model)

    #     utils.load_checkpoint(model_path, model, None)
    #     model.eval().to(device)
    #     if model_type == "vits":
    #         # plain TTS
    #         models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))

    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():
            # with gr.TabItem("自研"):
            #     with gr.Tabs():
            #         for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
            #             with gr.TabItem(model_name):
            #                 with gr.Column():
            #                     tts_input1 = gr.TextArea(label="Text", value=example)
            #                     tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
            #                     tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
            #                     tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
            #                                            minimum=0.0, maximum=2, step=0.1)
            #                     tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
            #                     tts_submit = gr.Button("Generate", variant="primary")
            #                     tts_output1 = gr.Textbox(label="Output Message")
            #                     tts_output2 = gr.Audio(label="Output Audio")
            #                     tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
            #                                      [tts_output1, tts_output2])

            # with gr.TabItem("谷歌"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(google_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(google, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])

            with gr.TabItem("微软"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

            # with gr.TabItem("coefont"):
            #     tts_input1 = gr.TextArea(label="Text", value=all_example)
            #     tts_input2 = gr.Dropdown(coefont_model_list, label="name")
            #     tts_submit = gr.Button("Generate", variant="primary")
            #     tts_output1 = gr.Textbox(label="Output Message")
            #     tts_output2 = gr.Audio(label="Output Audio")
            #     tts_submit.click(coefont, [tts_input1, tts_input2],
            #                      [tts_output1, tts_output2])

    app.launch(show_api=False,
               share=args.share,
               server_name='0.0.0.0',
               server_port=args.port,
               show_error=True)
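As written, microsoft() hands a raw requests.Response to a gr.Audio output, which Gradio cannot render; gr.Audio does accept a file path. A minimal sketch of the missing glue, keeping the handlers' current shape (to_audio_path is a hypothetical helper, not in the commit):

import tempfile

def to_audio_path(response):
    # dump the MP3 bytes to a temp file and hand Gradio the file path
    f = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    f.write(response.content)
    f.close()
    return f.name

# e.g. inside microsoft(): return "Success", to_audio_path(response)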
test2.mp3
ADDED
@@ -0,0 +1 @@
{"detail":{"status":"invalid_uid","message":"An invalid ID has been received: '<string>'. Make sure to provide a correct one."}}
test3.mp3
ADDED
Binary file (43.2 kB)
test4.mp3
ADDED
@@ -0,0 +1,17 @@
{
  "error": {
    "code": 401,
    "message": "Request had invalid authentication credentials. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project.",
    "status": "UNAUTHENTICATED",
    "details": [
      {
        "@type": "type.googleapis.com/google.rpc.ErrorInfo",
        "reason": "ACCESS_TOKEN_TYPE_UNSUPPORTED",
        "metadata": {
          "method": "google.cloud.texttospeech.v1.TextToSpeech.SynthesizeSpeech",
          "service": "texttospeech.googleapis.com"
        }
      }
    ]
  }
}
test9.mp3
ADDED
@@ -0,0 +1,17 @@
{
  "error": {
    "code": 401,
    "message": "Request had invalid authentication credentials. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project.",
    "status": "UNAUTHENTICATED",
    "details": [
      {
        "@type": "type.googleapis.com/google.rpc.ErrorInfo",
        "reason": "ACCESS_TOKEN_TYPE_UNSUPPORTED",
        "metadata": {
          "service": "texttospeech.googleapis.com",
          "method": "google.cloud.texttospeech.v1.TextToSpeech.SynthesizeSpeech"
        }
      }
    ]
  }
}
test99.mp3
ADDED
Binary file (43.2 kB)
try.py
ADDED
@@ -0,0 +1,33 @@
import requests
import os
import base64
import json
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./api.json"

url = "https://texttospeech.googleapis.com/v1/text:synthesize"
headers = {'Content-Type': 'application/json; charset=utf-8',
           'X-Goog-Api-Key': 'synclub-2383kjhjksxfv.2341gs'  # to be filled in
           }
text = "二月の下旬に差し掛かる頃だった。"

data = {
    "input": {
        "text": text
    },
    "voice": {
        "languageCode": "ja-JP",
        "name": "ja-JP-Neural2-C",
        "ssmlGender": "MALE"
    },
    "audioConfig": {
        "audioEncoding": "MP3"
    }
}
response = requests.post(url, headers=headers, json=data)
response = response.json()
print(response)
audio = response['audioContent']
audio = base64.b64decode(audio)
# The response's audio_content is binary.
with open("test.mp3", "wb") as out:
    out.write(audio)
try2.py
ADDED
File without changes
tts_gradio.py
ADDED
@@ -0,0 +1,255 @@
import warnings
warnings.filterwarnings("ignore")

# third-party libraries
import re
import requests
import argparse
import json
import os
import re
import tempfile
import librosa
import numpy as np
# import torch
# from torch import no_grad, LongTensor
# import commons
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils

all_example = "my voice is my passport verify me."

microsoft_model_list = [
    "en-US-AvaMultilingualNeural"
]

openai_model_list = [
    "alloy",
    "echo",
    "fable",
    "onyx",
    "nova",
    "shimmer"
]

eleven_voice_id = [
    "21m00Tcm4TlvDq8ikWAM",
    "29vD33N1CtxCmqQRPOHJ",
    "2EiwWnXFnvU5JabPnv8n",
    "5Q0t7uMcjvnagumLfvZi",
    "AZnzlk1XvdvUeBnXmlld",
    "CYw3kZ02Hs0563khs1Fj",
    "D38z5RcWu1voky8WS1ja",
    "EXAVITQu4vr4xnSDxMaL",
    "ErXwobaYiN019PkySvjV",
    "GBv7mTt0atIp3Br8iCZE",
    "IKne3meq5aSn9XLyUdCD",
    "JBFqnCBsd6RMkjVDRZzb",
    "LcfcDJNUP1GQjkzn1xUU",
    "MF3mGyEYCl7XYWbV9V6O",
    "N2lVS1w4EtoT3dr4eOWO",
    "ODq5zmih8GrVes37Dizd",
    "SOYHLrjzK2X1ezoPC6cr",
    "TX3LPaxmHKxFdv7VOQHJ",
    "ThT5KcBeYPX3keUQqHPh",
    "TxGEqnHWrfWFTfGW9XjX",
    "VR6AewLTigWG4xSOukaG",
    "XB0fDUnXU5powFXDhCwa",
    "Xb7hH8MSUJpSbSDYk0k2",
    "XrExE9yKIg1WjnnlVkGX",
    "ZQe5CZNOzWyzPSCn5a3c",
    "Zlb1dXrM653N07WRdFW3",
    "bVMeCyTHy58xNoL34h3p",
    "flq6f7yk4E4fJM5XTYuZ",
    "g5CIjZEefAph4nQFvHAz",
    "iP95p4xoKVk53GoZ742B",
    "jBpfuIE2acCO8z3wKNLl",
    "jsCqWAovK2LkecY7zXl4",
    "nPczCjzI2devNBz1zQrb",
    "oWAxZDx7w5VEj9dCyTzz",
    "onwK4e9ZLuTAKqWW03F9",
    "pFZP5JQG7iQjIQuC4Bku",
    "pMsXgVXv3BLzUgSXRplE",
    "pNInz6obpgDQGcFmaJgB",
    "piTKgcLEGmPE4e6mEKli",
    "pqHfZKP75CvOlQylNhV4",
    "t0jbNlBVZ17f02VDIeMI",
    "yoZ06aMxZJJ28mfd3POQ",
    "z9fAnlkpzviPz146aGWa",
    "zcAOhNBS3c14rBihAFp1",
    "zrHiDhphv9ZnVXBqCLjz",
]

eleven_name = [
    "Rachel",
    "Drew",
    "Clyde",
    "Paul",
    "Domi",
    "Dave",
    "Fin",
    "Sarah",
    "Antoni",
    "Thomas",
    "Charlie",
    "George",
    "Emily",
    "Elli",
    "Callum",
    "Patrick",
    "Harry",
    "Liam",
    "Dorothy",
    "Josh",
    "Arnold",
    "Charlotte",
    "Alice",
    "Matilda",
    "James",
    "Joseph",
    "Jeremy",
    "Michael",
    "Ethan",
    "Chris",
    "Gigi",
    "Freya",
    "Brian",
    "Grace",
    "Daniel",
    "Lily",
    "Serena",
    "Adam",
    "Nicole",
    "Bill",
    "Jessie",
    "Sam",
    "Glinda",
    "Giovanni",
    "Mimi",
]
eleven_id_model_name_dict = dict(zip(eleven_name, eleven_voice_id))

def openai(text, name):

    headers = {
        'Authorization': 'Bearer ' + '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',
        'Content-Type': 'application/json',
    }

    json_data = {
        'model': 'tts-1-hd',
        'input': f'{text}',
        'voice': f'{name}',
    }

    response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, json=json_data)

    # Note: json_data will not be serialized by requests
    # exactly as it was in the original request.
    #data = '{\n "model": "tts-1",\n "input": "The quick brown fox jumped over the lazy dog.",\n "voice": "alloy"\n }'
    #response = requests.post('https://api.openai.com/v1/audio/speech', headers=headers, data=data)

    return "Success", response

def elevenlabs(text, name):
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{eleven_id_model_name_dict[name]}"
    CHUNK_SIZE = 1024
    #url = "https://api.elevenlabs.io/v1/text-to-speech/<voice-id>"

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": "a3391f0e3ff8472b61978dbb70ccc6fe"
    }

    data = {
        "text": f"{text}",
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    response = requests.post(url, json=data, headers=headers)
    # with open('output.mp3', 'wb') as f:
    #     for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
    #         if chunk:
    #             f.write(chunk)
    return "Success", response

def microsoft(text, name, style="Neural"):
    """
    :param text:
    :param name:
    :param style:
    :return:
    """
    headers = {
        'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628',
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }

    data = ("<speak version='1.0' xml:lang='en-US'>"
            f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
            f"{text}"
            "</voice>"
            "</speak>")

    response = requests.post(
        'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
        headers=headers,
        data=data,
    )
    return "Success", "sss"

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument("--share", action="store_true", default=True, help="share gradio app")
    parser.add_argument("--port", type=int, default=8081, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()

    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():

            with gr.TabItem("11Labs"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(eleven_name, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(elevenlabs, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

            with gr.TabItem("微软"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

            with gr.TabItem("openai"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(openai_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(openai, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

    app.queue(max_size=10)
    app.launch(share=True)
    # _, audio = openai(all_example,'alloy')
    # print(audio)
    # with open("test99.mp3", "wb") as f:
    #     f.write(audio.content)
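The commented block at the very end is the intended smoke test for openai(); run standalone it would look roughly like this (a sketch assuming a valid key in the OPENAI_API_KEY environment variable rather than the committed one):

import os
import requests

headers = {
    "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"],  # assumed env var
    "Content-Type": "application/json",
}
json_data = {"model": "tts-1-hd",
             "input": "my voice is my passport verify me.",
             "voice": "alloy"}

resp = requests.post("https://api.openai.com/v1/audio/speech",
                     headers=headers, json=json_data)
resp.raise_for_status()
with open("test99.mp3", "wb") as f:
    f.write(resp.content)  # the speech endpoint returns raw MP3 bytes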