mosha255 committed
Commit fc37b9e · unverified · 1 Parent(s): 43eda61

Initial commit

.env-sample ADDED
@@ -0,0 +1,3 @@
+ APP_PORT=8080
+ DEBUG=true
+ BASE_URL=http://localhost:8080
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/
+
+ .env
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ # Use the official Python image from Docker Hub
+ FROM python:3.10-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Install git, git-lfs, and libsndfile
+ RUN apt-get update \
+     && apt-get install -y git git-lfs libsndfile1 \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/* \
+     && git lfs install
+
+ # Copy the requirements file into the container
+ COPY requirements.txt .
+
+ # Install the required packages
+ RUN pip install --no-cache-dir -r requirements.txt \
+     # Install gruut[sw] separately from the prebuilt wheel index
+     && pip install -f 'https://synesthesiam.github.io/prebuilt-apps/' 'gruut[sw]'
+
+ # Copy the rest of the application code into the container
+ COPY . .
+
+ # Expose the port the app runs on
+ EXPOSE 8080
+
+ # Run the application
+ CMD ["gunicorn", "--bind", "0.0.0.0:8080", "server:app"]
config.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # Configuration settings
+
+ class Config:
+     ENVIRONMENT = os.getenv('APP_ENV')
+     DEBUG = os.getenv('DEBUG') != 'false'  # debug stays on unless DEBUG is exactly 'false'
+     SECRET = os.getenv('SECRET_KEY')
+     HOST = os.getenv('APP_HOST') or '0.0.0.0'
+     PORT = int(os.getenv('APP_PORT') or 8080)
+     BASE_URL = os.getenv('BASE_URL') or 'http://localhost:8080'
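
Note the convention in Config.DEBUG: debugging stays enabled unless the DEBUG variable is set exactly to the string 'false'. A minimal sketch of how the settings resolve when the sample .env above is in place (the values in comments are what the sample implies, not captured output):

    # sketch: Config values with .env copied from .env-sample
    from config import Config

    print(Config.PORT)      # 8080, parsed from APP_PORT
    print(Config.DEBUG)     # True, since 'true' != 'false'
    print(Config.HOST)      # '0.0.0.0' fallback; APP_HOST is unset
    print(Config.BASE_URL)  # 'http://localhost:8080'
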
models/lightspeech_processor.json ADDED
@@ -0,0 +1 @@
+ {"symbol_to_id": {"@PAD": 0, "@f": 1, "@h": 2, "@i": 3, "@j": 4, "@k": 5, "@l": 6, "@m": 7, "@n": 8, "@p": 9, "@s": 10, "@t": 11, "@t\u0361\u0283": 12, "@u": 13, "@v": 14, "@w": 15, "@x": 16, "@z": 17, "@\u00f0": 18, "@\u014b": 19, "@\u0251": 20, "@\u0253": 21, "@\u0254": 22, "@\u0257": 23, "@\u025b": 24, "@\u0260": 25, "@\u0263": 26, "@\u027e": 27, "@\u0283": 28, "@\u0284": 29, "@\u03b8": 30, "@\u1d50\u0253": 31, "@\u1d51g": 32, "@\u1dacv": 33, "@\u207fz": 34, "@\u207f\u0257": 35, "@\u207f\u0257\u0361\u0292": 36, "!": 37, ",": 38, ".": 39, "?": 40, ";": 41, ":": 42, "@SIL": 43, "@EOS": 44}, "id_to_symbol": {"0": "@PAD", "1": "@f", "2": "@h", "3": "@i", "4": "@j", "5": "@k", "6": "@l", "7": "@m", "8": "@n", "9": "@p", "10": "@s", "11": "@t", "12": "@t\u0361\u0283", "13": "@u", "14": "@v", "15": "@w", "16": "@x", "17": "@z", "18": "@\u00f0", "19": "@\u014b", "20": "@\u0251", "21": "@\u0253", "22": "@\u0254", "23": "@\u0257", "24": "@\u025b", "25": "@\u0260", "26": "@\u0263", "27": "@\u027e", "28": "@\u0283", "29": "@\u0284", "30": "@\u03b8", "31": "@\u1d50\u0253", "32": "@\u1d51g", "33": "@\u1dacv", "34": "@\u207fz", "35": "@\u207f\u0257", "36": "@\u207f\u0257\u0361\u0292", "37": "!", "38": ",", "39": ".", "40": "?", "41": ";", "42": ":", "43": "@SIL", "44": "@EOS"}, "speakers_map": {"sw-TZ-Victoria": 0}, "processor_name": "SwahiliIPAProcessor"}
models/lightspeech_quant.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f9878e0a686f5237d57364e89acda8126a3da7b231453eb4d419492653a366c
+ size 4663604
models/mbmelgan.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:96ad87ee030197df993242eed4521bc3fe5fda43df778c1144c7ed50252a6bb3
+ size 10459516
outputs/.gitignore ADDED
@@ -0,0 +1,3 @@
+ *
+ !.gitignore
+ !README.md
outputs/README.md ADDED
@@ -0,0 +1,3 @@
+ # Outputs Folder
+
+ Placeholder folder for model run outputs.
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ Flask==2.3.2
+ numpy==1.25.0
+ gunicorn==20.1.0
+ gruut
+ onnxruntime==1.15.1
+ soundfile==0.12.1
+ IPython==8.12.0
+ pydub
+ nltk
+ python-dotenv
+ flask_cors
+ gruut[sw] -f https://synesthesiam.github.io/prebuilt-apps
+
+ librosa
+ phonemizer
+ g2p_id
server.py ADDED
@@ -0,0 +1,45 @@
+ from flask import Flask, send_from_directory, request, jsonify
+ from flask_cors import CORS
+
+ import datetime
+
+ from config import Config
+ from tts import TTS
+
+ app = Flask(__name__)
+ CORS(app)  # Enable CORS for all routes
+
+ @app.route('/')
+ def index():
+     return 'Server is up!'
+
+
+ @app.route('/text_to_speech', methods=['POST'])
+ def text_to_speech():
+     try:
+         data = request.json
+         input_text = data.get('text')
+
+         audio_array = TTS.generate(input_text)
+
+         now = datetime.datetime.now()
+         now_str = now.strftime("%Y%m%d_%H%M%S")
+         file_name = f"output_{now_str}.wav"
+         file_path = f"./outputs/{file_name}"
+
+         # Save the audio to a file
+         TTS.save_audio(audio_array, file_path)
+
+         audio_url = f"{Config.BASE_URL}/audio/{file_name}"
+
+         return jsonify(success=True, audio_url=audio_url)
+     except Exception as e:
+         return jsonify(success=False, error=str(e))
+
+
+ @app.route('/audio/<path:path>')
+ def send_audio(path):
+     return send_from_directory('outputs', path)
+
+ if __name__ == '__main__':
+     app.run(host=Config.HOST, port=Config.PORT, debug=Config.DEBUG)
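
For reference, a minimal standard-library client for the endpoint above (a sketch; it assumes the server is already running at http://localhost:8080, and the output file name local_copy.wav is hypothetical):

    # sketch: call /text_to_speech, then download the generated audio
    import json
    import urllib.request

    req = urllib.request.Request(
        "http://localhost:8080/text_to_speech",
        data=json.dumps({"text": "Kiswahili lugha yangu"}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        result = json.load(resp)

    if result.get("success"):
        # the audio_url points at the /audio/<file> route served above
        urllib.request.urlretrieve(result["audio_url"], "local_copy.wav")
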
tests/POST_text_to_speech.http ADDED
@@ -0,0 +1,11 @@
+ ### Submit text
+ POST http://localhost:8080/text_to_speech
+ Content-Type: application/json
+
+ {
+     "text": "Kiswahili lugha yangu"
+ }
+
+
+ ### Fetch audio
+ GET http://localhost:8080/audio/output_20241206_100849.wav
tts.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tts.py
2
+
3
+ from gruut import sentences
4
+ import re
5
+ import numpy as np
6
+ import onnxruntime as ort
7
+ from pathlib import Path
8
+ import json
9
+ import string
10
+ from IPython.display import Audio
11
+ import soundfile as sf
12
+
13
+ # Load models
14
+ lightspeech = ort.InferenceSession("./models/lightspeech_quant.onnx")
15
+ mbmelgan = ort.InferenceSession("./models/mbmelgan.onnx")
16
+ lightspeech_processor_config = Path("./models/lightspeech_processor.json")
17
+
18
+ with open(lightspeech_processor_config, "r") as f:
19
+ processor = json.load(f)
20
+ tokenizer = processor["symbol_to_id"]
21
+
22
+ class TTS:
23
+ @staticmethod
24
+ def generate(text: str) -> np.ndarray:
25
+ sections = TTS.split_text(text)
26
+ audio_sections = TTS.generate_speech_for_sections(sections)
27
+ concatenated_audio = TTS.concatenate_audio_sections(audio_sections)
28
+ return concatenated_audio
29
+
30
+ @staticmethod
31
+ def split_text(text: str) -> list:
32
+ # Split the text into sentences based on punctuation marks
33
+ sentences = re.split(r'(?<=[.!?])\s*', text)
34
+ sections = []
35
+
36
+ for sentence in sentences:
37
+ # Split each sentence by commas for short pauses
38
+ parts = re.split(r',\s*', sentence)
39
+ for i, part in enumerate(parts):
40
+ sections.append(part.strip())
41
+ if i < len(parts) - 1:
42
+ sections.append('*') # Short pause marker
43
+ sections.append('**') # Long pause marker after each sentence
44
+
45
+ # Remove empty sections
46
+ sections = [section for section in sections if section]
47
+
48
+ return sections
49
+
50
+ @staticmethod
51
+ def generate_speech_for_sections(sections: list) -> list:
52
+ audio_sections = []
53
+ for section in sections:
54
+ if section == '**':
55
+ # Long pause
56
+ pause_duration = 1.0
57
+ sample_rate = 44100
58
+ pause = np.zeros(int(pause_duration * sample_rate))
59
+ audio_sections.append(pause)
60
+ elif section == '*':
61
+ # Short pause
62
+ pause_duration = 0.4
63
+ sample_rate = 44100
64
+ pause = np.zeros(int(pause_duration * sample_rate))
65
+ audio_sections.append(pause)
66
+ else:
67
+ mel_output, durations = TTS.text2mel(section)
68
+ audio_array = TTS.mel2wav(mel_output)
69
+ audio_sections.append(audio_array)
70
+ return audio_sections
71
+
72
+ @staticmethod
73
+ def concatenate_audio_sections(audio_sections: list) -> np.ndarray:
74
+ concatenated_audio = np.concatenate(audio_sections)
75
+ return concatenated_audio
76
+
77
+
78
+
79
+ @staticmethod
80
+ def phonemize(word: str) -> str:
81
+ ipa = []
82
+ for words in sentences(word, lang="sw"):
83
+ for word in words:
84
+ if word.is_major_break or word.is_minor_break:
85
+ ipa += [word.text]
86
+ continue
87
+
88
+ phonemes = word.phonemes[:]
89
+ NG_GRAPHEME = "ng'"
90
+ NG_PRENASALIZED_PHONEME = "ᵑg"
91
+ NG_PHONEME = "ŋ"
92
+ if NG_GRAPHEME in word.text:
93
+ ng_graphemes = re.findall(f"{NG_GRAPHEME}?", word.text)
94
+ ng_phonemes_idx = [i for i, p in enumerate(phonemes) if p == NG_PRENASALIZED_PHONEME]
95
+ assert len(ng_graphemes) == len(ng_phonemes_idx)
96
+ for i, g in zip(ng_phonemes_idx, ng_graphemes):
97
+ phonemes[i] = NG_PHONEME if g == NG_GRAPHEME else phonemes[i]
98
+
99
+ ipa += phonemes
100
+ return ipa
101
+
102
+ @staticmethod
103
+ def tokenize(phonemes):
104
+ input_ids = []
105
+ for phoneme in phonemes:
106
+ if all(c in string.punctuation for c in phoneme):
107
+ input_ids.append(tokenizer[phoneme])
108
+ else:
109
+ input_ids.append(tokenizer[f"@{phoneme}"])
110
+ return input_ids
111
+
112
+ @staticmethod
113
+ def text2mel(text: str) -> tuple:
114
+ phonemes = TTS.phonemize(text)
115
+ input_ids = TTS.tokenize(phonemes)
116
+
117
+ inputs = {
118
+ "input_ids": np.array([input_ids], dtype=np.int32),
119
+ "speaker_ids": np.array([0], dtype=np.int32),
120
+ "speed_ratios": np.array([1.0], dtype=np.float32),
121
+ "f0_ratios": np.array([1.0], dtype=np.float32),
122
+ "energy_ratios": np.array([1.0], dtype=np.float32),
123
+ }
124
+
125
+ mel_output, durations, _ = lightspeech.run(None, inputs)
126
+ return mel_output, durations
127
+
128
+ @staticmethod
129
+ def mel2wav(mel_output: np.ndarray) -> np.ndarray:
130
+ # Prepare input for vocoder model
131
+ inputs = {
132
+ "mels": mel_output,
133
+ }
134
+
135
+ # Run inference
136
+ outputs = mbmelgan.run(None, inputs)
137
+ audio_array = outputs[0][0, :, 0]
138
+
139
+ return audio_array
140
+
141
+ @staticmethod
142
+ def synthesize(text: str) -> np.ndarray:
143
+ mel_output, _ = TTS.text2mel(text)
144
+ audio_array = TTS.mel2wav(mel_output)
145
+ return audio_array
146
+
147
+ @staticmethod
148
+ def save_audio(audio_array: np.ndarray, path: str):
149
+ sf.write(path, audio_array, 44100)
150
+
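
The TTS class can also be driven directly, without the Flask server. A minimal sketch, assuming the ONNX models are present under ./models and the outputs/ directory exists:

    # sketch: direct use of the pipeline from tts.py
    from tts import TTS

    audio = TTS.generate("Habari za asubuhi. Karibu sana!")
    TTS.save_audio(audio, "./outputs/demo.wav")  # written at 44.1 kHz
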