Initial commit
- .env-sample +3 -0
- .gitignore +3 -0
- Dockerfile +29 -0
- config.py +15 -0
- models/lightspeech_processor.json +1 -0
- models/lightspeech_quant.onnx +3 -0
- models/mbmelgan.onnx +3 -0
- outputs/.gitignore +3 -0
- outputs/README.md +3 -0
- requirements.txt +16 -0
- server.py +51 -0
- tests/POST_text_to_speech.http +11 -0
- tts.py +150 -0
.env-sample
ADDED
@@ -0,0 +1,3 @@
+APP_PORT=8080
+DEBUG=true
+BASE_URL=http://localhost:8080
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__/
+
+.env
Dockerfile
ADDED
@@ -0,0 +1,29 @@
+# Use the official Python image from Docker Hub
+FROM python:3.10-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Install git, git-lfs, and libsndfile
+RUN apt-get update \
+    && apt-get install -y git git-lfs libsndfile1 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && git lfs install
+
+# Copy the requirements file into the container
+COPY requirements.txt .
+
+# Install the required packages
+RUN pip install --no-cache-dir -r requirements.txt \
+    # Install gruut[sw] separately from the prebuilt wheel index
+    && pip install -f 'https://synesthesiam.github.io/prebuilt-apps/' 'gruut[sw]'
+
+# Copy the rest of the application code into the container
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 8080
+
+# Run the application
+CMD ["gunicorn", "--bind", "0.0.0.0:8080", "server:app"]
config.py
ADDED
@@ -0,0 +1,15 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Configuration settings
+
+class Config:
+    ENVIRONMENT = os.getenv('APP_ENV')
+    DEBUG = os.getenv('DEBUG') != 'false'
+    SECRET = os.getenv('SECRET_KEY')
+    HOST = os.getenv('APP_HOST') or '0.0.0.0'
+    PORT = int(os.getenv('APP_PORT') or 8080)
+    BASE_URL = os.getenv('BASE_URL') or 'http://localhost:8080'
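A minimal usage sketch for the config above (illustrative, not part of the commit). With the values from .env-sample loaded, note that DEBUG is treated as true for any value other than the literal string 'false':

    from config import Config

    print(Config.PORT)      # 8080, from APP_PORT in .env
    print(Config.DEBUG)     # True unless DEBUG is exactly 'false'
    print(Config.BASE_URL)  # 'http://localhost:8080'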
models/lightspeech_processor.json
ADDED
@@ -0,0 +1 @@
+{"symbol_to_id": {"@PAD": 0, "@f": 1, "@h": 2, "@i": 3, "@j": 4, "@k": 5, "@l": 6, "@m": 7, "@n": 8, "@p": 9, "@s": 10, "@t": 11, "@t\u0361\u0283": 12, "@u": 13, "@v": 14, "@w": 15, "@x": 16, "@z": 17, "@\u00f0": 18, "@\u014b": 19, "@\u0251": 20, "@\u0253": 21, "@\u0254": 22, "@\u0257": 23, "@\u025b": 24, "@\u0260": 25, "@\u0263": 26, "@\u027e": 27, "@\u0283": 28, "@\u0284": 29, "@\u03b8": 30, "@\u1d50\u0253": 31, "@\u1d51g": 32, "@\u1dacv": 33, "@\u207fz": 34, "@\u207f\u0257": 35, "@\u207f\u0257\u0361\u0292": 36, "!": 37, ",": 38, ".": 39, "?": 40, ";": 41, ":": 42, "@SIL": 43, "@EOS": 44}, "id_to_symbol": {"0": "@PAD", "1": "@f", "2": "@h", "3": "@i", "4": "@j", "5": "@k", "6": "@l", "7": "@m", "8": "@n", "9": "@p", "10": "@s", "11": "@t", "12": "@t\u0361\u0283", "13": "@u", "14": "@v", "15": "@w", "16": "@x", "17": "@z", "18": "@\u00f0", "19": "@\u014b", "20": "@\u0251", "21": "@\u0253", "22": "@\u0254", "23": "@\u0257", "24": "@\u025b", "25": "@\u0260", "26": "@\u0263", "27": "@\u027e", "28": "@\u0283", "29": "@\u0284", "30": "@\u03b8", "31": "@\u1d50\u0253", "32": "@\u1d51g", "33": "@\u1dacv", "34": "@\u207fz", "35": "@\u207f\u0257", "36": "@\u207f\u0257\u0361\u0292", "37": "!", "38": ",", "39": ".", "40": "?", "41": ";", "42": ":", "43": "@SIL", "44": "@EOS"}, "speakers_map": {"sw-TZ-Victoria": 0}, "processor_name": "SwahiliIPAProcessor"}
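A quick sketch of how this symbol table is consumed (see TTS.tokenize in tts.py below): phoneme symbols are stored with an "@" prefix, while punctuation marks are stored bare.

    import json

    with open("models/lightspeech_processor.json") as f:
        symbol_to_id = json.load(f)["symbol_to_id"]

    print(symbol_to_id["@k"])  # 5
    print(symbol_to_id["."])   # 39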
models/lightspeech_quant.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f9878e0a686f5237d57364e89acda8126a3da7b231453eb4d419492653a366c
+size 4663604
models/mbmelgan.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96ad87ee030197df993242eed4521bc3fe5fda43df778c1144c7ed50252a6bb3
+size 10459516
outputs/.gitignore
ADDED
@@ -0,0 +1,3 @@
+*
+!.gitignore
+!README.md
outputs/README.md
ADDED
@@ -0,0 +1,3 @@
+# Outputs Folder
+
+Placeholder folder for model run outputs.
requirements.txt
ADDED
@@ -0,0 +1,16 @@
+Flask==2.3.2
+numpy==1.25.0
+gunicorn==20.1.0
+gruut
+onnxruntime==1.15.1
+soundfile==0.12.1
+IPython==8.12.0
+pydub
+nltk
+python-dotenv
+flask_cors
+-f https://synesthesiam.github.io/prebuilt-apps
+gruut[sw]
+librosa
+phonemizer
+g2p_id
server.py
ADDED
@@ -0,0 +1,51 @@
+from flask import Flask, send_from_directory, request, jsonify
+from flask_cors import CORS
+
+app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
+
+# Import the TTS class
+from tts import TTS
+import numpy as np
+import onnxruntime as ort
+from pathlib import Path
+import datetime
+from config import Config
+
+@app.route('/')
+def index():
+    return 'Server is up!'
+
+
+@app.route('/text_to_speech', methods=['POST'])
+def text_to_speech():
+    try:
+        data = request.json
+        input_text = data.get('text')
+
+        try:
+            audio_array = TTS.generate(input_text)
+
+            now = datetime.datetime.now()
+            now_str = now.strftime("%Y%m%d_%H%M%S")
+            file_name = f"output_{now_str}.wav"
+            file_path = f"./outputs/{file_name}"
+
+            # Save the audio to a file
+            TTS.save_audio(audio_array, file_path)
+
+            audio_url = f"{Config.BASE_URL}/audio/{file_name}"
+
+            return dict(success=True, audio_url=audio_url)
+        except Exception as e:
+            return dict(success=False, error=str(e))
+    except Exception as e:
+        return dict(success=False, error=str(e))
+
+
+@app.route('/audio/<path:path>')
+def send_audio(path):
+    return send_from_directory('outputs', path)
+
+if __name__ == '__main__':
+    app.run(host=Config.HOST, port=Config.PORT, debug=Config.DEBUG)
tests/POST_text_to_speech.http
ADDED
@@ -0,0 +1,11 @@
+### Submit text
+POST http://localhost:8080/text_to_speech
+Content-Type: application/json
+
+{
+  "text": "Kiswahili lugha yangu"
+}
+
+
+### Fetch audio
+GET http://localhost:8080/audio/output_20241206_100849.wav
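The same request, sketched with the requests library (assumes the server from server.py is running locally on port 8080; the returned file name is timestamped):

    import requests

    resp = requests.post(
        "http://localhost:8080/text_to_speech",
        json={"text": "Kiswahili lugha yangu"},
    )
    # Expected shape: {'success': True, 'audio_url': 'http://localhost:8080/audio/output_<timestamp>.wav'}
    print(resp.json())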
tts.py
ADDED
@@ -0,0 +1,150 @@
+# tts.py
+
+from gruut import sentences
+import re
+import numpy as np
+import onnxruntime as ort
+from pathlib import Path
+import json
+import string
+from IPython.display import Audio
+import soundfile as sf
+
+# Load models
+lightspeech = ort.InferenceSession("./models/lightspeech_quant.onnx")
+mbmelgan = ort.InferenceSession("./models/mbmelgan.onnx")
+lightspeech_processor_config = Path("./models/lightspeech_processor.json")
+
+with open(lightspeech_processor_config, "r") as f:
+    processor = json.load(f)
+    tokenizer = processor["symbol_to_id"]
+
+class TTS:
+    @staticmethod
+    def generate(text: str) -> np.ndarray:
+        sections = TTS.split_text(text)
+        audio_sections = TTS.generate_speech_for_sections(sections)
+        concatenated_audio = TTS.concatenate_audio_sections(audio_sections)
+        return concatenated_audio
+
+    @staticmethod
+    def split_text(text: str) -> list:
+        # Split the text into sentences based on punctuation marks
+        raw_sentences = re.split(r'(?<=[.!?])\s*', text)
+        sections = []
+
+        for sentence in raw_sentences:
+            # Split each sentence by commas for short pauses
+            parts = re.split(r',\s*', sentence)
+            for i, part in enumerate(parts):
+                sections.append(part.strip())
+                if i < len(parts) - 1:
+                    sections.append('*')  # Short pause marker
+            sections.append('**')  # Long pause marker after each sentence
+
+        # Remove empty sections
+        sections = [section for section in sections if section]
+
+        return sections
+
+    @staticmethod
+    def generate_speech_for_sections(sections: list) -> list:
+        audio_sections = []
+        for section in sections:
+            if section == '**':
+                # Long pause
+                pause_duration = 1.0
+                sample_rate = 44100
+                pause = np.zeros(int(pause_duration * sample_rate))
+                audio_sections.append(pause)
+            elif section == '*':
+                # Short pause
+                pause_duration = 0.4
+                sample_rate = 44100
+                pause = np.zeros(int(pause_duration * sample_rate))
+                audio_sections.append(pause)
+            else:
+                mel_output, durations = TTS.text2mel(section)
+                audio_array = TTS.mel2wav(mel_output)
+                audio_sections.append(audio_array)
+        return audio_sections
+
+    @staticmethod
+    def concatenate_audio_sections(audio_sections: list) -> np.ndarray:
+        concatenated_audio = np.concatenate(audio_sections)
+        return concatenated_audio
+
+
+
+    @staticmethod
+    def phonemize(text: str) -> list:
+        ipa = []
+        for words in sentences(text, lang="sw"):
+            for word in words:
+                if word.is_major_break or word.is_minor_break:
+                    ipa += [word.text]
+                    continue
+
+                phonemes = word.phonemes[:]
+                NG_GRAPHEME = "ng'"
+                NG_PRENASALIZED_PHONEME = "ᵑg"
+                NG_PHONEME = "ŋ"
+                if NG_GRAPHEME in word.text:
+                    ng_graphemes = re.findall(f"{NG_GRAPHEME}?", word.text)
+                    ng_phonemes_idx = [i for i, p in enumerate(phonemes) if p == NG_PRENASALIZED_PHONEME]
+                    assert len(ng_graphemes) == len(ng_phonemes_idx)
+                    for i, g in zip(ng_phonemes_idx, ng_graphemes):
+                        phonemes[i] = NG_PHONEME if g == NG_GRAPHEME else phonemes[i]
+
+                ipa += phonemes
+        return ipa
+
+    @staticmethod
+    def tokenize(phonemes):
+        input_ids = []
+        for phoneme in phonemes:
+            if all(c in string.punctuation for c in phoneme):
+                input_ids.append(tokenizer[phoneme])
+            else:
+                input_ids.append(tokenizer[f"@{phoneme}"])
+        return input_ids
+
+    @staticmethod
+    def text2mel(text: str) -> tuple:
+        phonemes = TTS.phonemize(text)
+        input_ids = TTS.tokenize(phonemes)
+
+        inputs = {
+            "input_ids": np.array([input_ids], dtype=np.int32),
+            "speaker_ids": np.array([0], dtype=np.int32),
+            "speed_ratios": np.array([1.0], dtype=np.float32),
+            "f0_ratios": np.array([1.0], dtype=np.float32),
+            "energy_ratios": np.array([1.0], dtype=np.float32),
+        }
+
+        mel_output, durations, _ = lightspeech.run(None, inputs)
+        return mel_output, durations
+
+    @staticmethod
+    def mel2wav(mel_output: np.ndarray) -> np.ndarray:
+        # Prepare input for vocoder model
+        inputs = {
+            "mels": mel_output,
+        }
+
+        # Run inference
+        outputs = mbmelgan.run(None, inputs)
+        audio_array = outputs[0][0, :, 0]
+
+        return audio_array
+
+    @staticmethod
+    def synthesize(text: str) -> np.ndarray:
+        mel_output, _ = TTS.text2mel(text)
+        audio_array = TTS.mel2wav(mel_output)
+        return audio_array
+
+    @staticmethod
+    def save_audio(audio_array: np.ndarray, path: str):
+        sf.write(path, audio_array, 44100)
+
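A minimal end-to-end usage sketch for the TTS class above (the output path is hypothetical; the outputs/ folder is git-ignored as shown earlier):

    from tts import TTS

    audio = TTS.generate("Kiswahili lugha yangu.")
    TTS.save_audio(audio, "./outputs/example.wav")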