Initial commit
- .env-sample +3 -0
- .gitignore +3 -0
- Dockerfile +29 -0
- config.py +15 -0
- models/lightspeech_processor.json +1 -0
- models/lightspeech_quant.onnx +3 -0
- models/mbmelgan.onnx +3 -0
- outputs/.gitignore +3 -0
- outputs/README.md +3 -0
- requirements.txt +16 -0
- server.py +51 -0
- tests/POST_text_to_speech.http +11 -0
- tts.py +150 -0
.env-sample
ADDED
@@ -0,0 +1,3 @@
+APP_PORT=8080
+DEBUG=true
+BASE_URL=http://localhost:8080
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__/
+
+.env
Dockerfile
ADDED
@@ -0,0 +1,29 @@
+# Use the official Python image from Docker Hub
+FROM python:3.10-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Install git, git-lfs, and libsndfile
+RUN apt-get update \
+    && apt-get install -y git git-lfs libsndfile1 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && git lfs install
+
+# Copy the requirements file into the container
+COPY requirements.txt .
+
+# Install the required packages
+RUN pip install --no-cache-dir -r requirements.txt \
+    # Install gruut[sw] separately from the prebuilt wheel index
+    && pip install -f 'https://synesthesiam.github.io/prebuilt-apps/' 'gruut[sw]'
+
+# Copy the rest of the application code into the container
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 8080
+
+# Run the application
+CMD ["gunicorn", "--bind", "0.0.0.0:8080", "server:app"]
config.py
ADDED
@@ -0,0 +1,15 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Configuration settings
+
+class Config:
+    ENVIRONMENT = os.getenv('APP_ENV')
+    DEBUG = os.getenv('DEBUG') != 'false'
+    SECRET = os.getenv('SECRET_KEY')
+    HOST = os.getenv('APP_HOST') or '0.0.0.0'
+    PORT = int(os.getenv('APP_PORT') or 8080)
+    BASE_URL = os.getenv('BASE_URL') or 'http://localhost:8080'
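A minimal usage sketch for the config above (illustrative, not part of the commit). With the values from .env-sample loaded, note that DEBUG is treated as true for any value other than the literal string 'false':

    from config import Config

    print(Config.PORT)      # 8080, from APP_PORT in .env
    print(Config.DEBUG)     # True unless DEBUG is exactly 'false'
    print(Config.BASE_URL)  # 'http://localhost:8080'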
models/lightspeech_processor.json
ADDED
@@ -0,0 +1 @@
+{"symbol_to_id": {"@PAD": 0, "@f": 1, "@h": 2, "@i": 3, "@j": 4, "@k": 5, "@l": 6, "@m": 7, "@n": 8, "@p": 9, "@s": 10, "@t": 11, "@t\u0361\u0283": 12, "@u": 13, "@v": 14, "@w": 15, "@x": 16, "@z": 17, "@\u00f0": 18, "@\u014b": 19, "@\u0251": 20, "@\u0253": 21, "@\u0254": 22, "@\u0257": 23, "@\u025b": 24, "@\u0260": 25, "@\u0263": 26, "@\u027e": 27, "@\u0283": 28, "@\u0284": 29, "@\u03b8": 30, "@\u1d50\u0253": 31, "@\u1d51g": 32, "@\u1dacv": 33, "@\u207fz": 34, "@\u207f\u0257": 35, "@\u207f\u0257\u0361\u0292": 36, "!": 37, ",": 38, ".": 39, "?": 40, ";": 41, ":": 42, "@SIL": 43, "@EOS": 44}, "id_to_symbol": {"0": "@PAD", "1": "@f", "2": "@h", "3": "@i", "4": "@j", "5": "@k", "6": "@l", "7": "@m", "8": "@n", "9": "@p", "10": "@s", "11": "@t", "12": "@t\u0361\u0283", "13": "@u", "14": "@v", "15": "@w", "16": "@x", "17": "@z", "18": "@\u00f0", "19": "@\u014b", "20": "@\u0251", "21": "@\u0253", "22": "@\u0254", "23": "@\u0257", "24": "@\u025b", "25": "@\u0260", "26": "@\u0263", "27": "@\u027e", "28": "@\u0283", "29": "@\u0284", "30": "@\u03b8", "31": "@\u1d50\u0253", "32": "@\u1d51g", "33": "@\u1dacv", "34": "@\u207fz", "35": "@\u207f\u0257", "36": "@\u207f\u0257\u0361\u0292", "37": "!", "38": ",", "39": ".", "40": "?", "41": ";", "42": ":", "43": "@SIL", "44": "@EOS"}, "speakers_map": {"sw-TZ-Victoria": 0}, "processor_name": "SwahiliIPAProcessor"}
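A quick sketch of how this symbol table is consumed (see TTS.tokenize in tts.py below): phoneme symbols are stored with an "@" prefix, while punctuation marks are stored bare.

    import json

    with open("models/lightspeech_processor.json") as f:
        symbol_to_id = json.load(f)["symbol_to_id"]

    print(symbol_to_id["@k"])  # 5
    print(symbol_to_id["."])   # 39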
models/lightspeech_quant.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f9878e0a686f5237d57364e89acda8126a3da7b231453eb4d419492653a366c
+size 4663604
models/mbmelgan.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96ad87ee030197df993242eed4521bc3fe5fda43df778c1144c7ed50252a6bb3
+size 10459516
outputs/.gitignore
ADDED
@@ -0,0 +1,3 @@
+*
+!.gitignore
+!README.md
outputs/README.md
ADDED
@@ -0,0 +1,3 @@
+# Outputs Folder
+
+Placeholder folder for model run outputs.
requirements.txt
ADDED
@@ -0,0 +1,16 @@
+Flask==2.3.2
+numpy==1.25.0
+gunicorn==20.1.0
+gruut
+onnxruntime==1.15.1
+soundfile==0.12.1
+IPython==8.12.0
+pydub
+nltk
+python-dotenv
+flask_cors
+-f https://synesthesiam.github.io/prebuilt-apps
+gruut[sw]
+librosa
+phonemizer
+g2p_id
server.py
ADDED
@@ -0,0 +1,51 @@
+from flask import Flask, send_from_directory, request, jsonify
+from flask_cors import CORS
+
+app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
+
+# Import the TTS class
+from tts import TTS
+import numpy as np
+import onnxruntime as ort
+from pathlib import Path
+import datetime
+from config import Config
+
+@app.route('/')
+def index():
+    return 'Server is up!'
+
+
+@app.route('/text_to_speech', methods=['POST'])
+def text_to_speech():
+    try:
+        data = request.json
+        input_text = data.get('text')
+
+        try:
+            audio_array = TTS.generate(input_text)
+
+            now = datetime.datetime.now()
+            now_str = now.strftime("%Y%m%d_%H%M%S")
+            file_name = f"output_{now_str}.wav"
+            file_path = f"./outputs/{file_name}"
+
+            # Save the audio to a file
+            TTS.save_audio(audio_array, file_path)
+
+            audio_url = f"{Config.BASE_URL}/audio/{file_name}"
+
+            return dict(success=True, audio_url=audio_url)
+        except Exception as e:
+            return dict(success=False, error=str(e))
+    except Exception as e:
+        return dict(success=False, error=str(e))
+
+
+@app.route('/audio/<path:path>')
+def send_audio(path):
+    return send_from_directory('outputs', path)
+
+if __name__ == '__main__':
+    app.run(host=Config.HOST, port=Config.PORT, debug=Config.DEBUG)
tests/POST_text_to_speech.http
ADDED
@@ -0,0 +1,11 @@
+### Submit text
+POST http://localhost:8080/text_to_speech
+Content-Type: application/json
+
+{
+  "text": "Kiswahili lugha yangu"
+}
+
+
+### Fetch audio
+GET http://localhost:8080/audio/output_20241206_100849.wav
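The same request, sketched with the requests library (assumes the server from server.py is running locally on port 8080; the returned file name is timestamped):

    import requests

    resp = requests.post(
        "http://localhost:8080/text_to_speech",
        json={"text": "Kiswahili lugha yangu"},
    )
    # Expected shape: {'success': True, 'audio_url': 'http://localhost:8080/audio/output_<timestamp>.wav'}
    print(resp.json())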
tts.py
ADDED
@@ -0,0 +1,150 @@
+# tts.py
+
+from gruut import sentences
+import re
+import numpy as np
+import onnxruntime as ort
+from pathlib import Path
+import json
+import string
+from IPython.display import Audio
+import soundfile as sf
+
+# Load models
+lightspeech = ort.InferenceSession("./models/lightspeech_quant.onnx")
+mbmelgan = ort.InferenceSession("./models/mbmelgan.onnx")
+lightspeech_processor_config = Path("./models/lightspeech_processor.json")
+
+with open(lightspeech_processor_config, "r") as f:
+    processor = json.load(f)
+    tokenizer = processor["symbol_to_id"]
+
+class TTS:
+    @staticmethod
+    def generate(text: str) -> np.ndarray:
+        sections = TTS.split_text(text)
+        audio_sections = TTS.generate_speech_for_sections(sections)
+        concatenated_audio = TTS.concatenate_audio_sections(audio_sections)
+        return concatenated_audio
+
+    @staticmethod
+    def split_text(text: str) -> list:
+        # Split the text into sentences based on punctuation marks
+        raw_sentences = re.split(r'(?<=[.!?])\s*', text)
+        sections = []
+
+        for sentence in raw_sentences:
+            # Split each sentence by commas for short pauses
+            parts = re.split(r',\s*', sentence)
+            for i, part in enumerate(parts):
+                sections.append(part.strip())
+                if i < len(parts) - 1:
+                    sections.append('*')  # Short pause marker
+            sections.append('**')  # Long pause marker after each sentence
+
+        # Remove empty sections
+        sections = [section for section in sections if section]
+
+        return sections
+
+    @staticmethod
+    def generate_speech_for_sections(sections: list) -> list:
+        audio_sections = []
+        for section in sections:
+            if section == '**':
+                # Long pause
+                pause_duration = 1.0
+                sample_rate = 44100
+                pause = np.zeros(int(pause_duration * sample_rate))
+                audio_sections.append(pause)
+            elif section == '*':
+                # Short pause
+                pause_duration = 0.4
+                sample_rate = 44100
+                pause = np.zeros(int(pause_duration * sample_rate))
+                audio_sections.append(pause)
+            else:
+                mel_output, durations = TTS.text2mel(section)
+                audio_array = TTS.mel2wav(mel_output)
+                audio_sections.append(audio_array)
+        return audio_sections
+
+    @staticmethod
+    def concatenate_audio_sections(audio_sections: list) -> np.ndarray:
+        concatenated_audio = np.concatenate(audio_sections)
+        return concatenated_audio
+
+
+
+    @staticmethod
+    def phonemize(text: str) -> list:
+        ipa = []
+        for words in sentences(text, lang="sw"):
+            for word in words:
+                if word.is_major_break or word.is_minor_break:
+                    ipa += [word.text]
+                    continue
+
+                phonemes = word.phonemes[:]
+                NG_GRAPHEME = "ng'"
+                NG_PRENASALIZED_PHONEME = "ᵑg"
+                NG_PHONEME = "ŋ"
+                if NG_GRAPHEME in word.text:
+                    ng_graphemes = re.findall(f"{NG_GRAPHEME}?", word.text)
+                    ng_phonemes_idx = [i for i, p in enumerate(phonemes) if p == NG_PRENASALIZED_PHONEME]
+                    assert len(ng_graphemes) == len(ng_phonemes_idx)
+                    for i, g in zip(ng_phonemes_idx, ng_graphemes):
+                        phonemes[i] = NG_PHONEME if g == NG_GRAPHEME else phonemes[i]
+
+                ipa += phonemes
+        return ipa
+
+    @staticmethod
+    def tokenize(phonemes):
+        input_ids = []
+        for phoneme in phonemes:
+            if all(c in string.punctuation for c in phoneme):
+                input_ids.append(tokenizer[phoneme])
+            else:
+                input_ids.append(tokenizer[f"@{phoneme}"])
+        return input_ids
+
+    @staticmethod
+    def text2mel(text: str) -> tuple:
+        phonemes = TTS.phonemize(text)
+        input_ids = TTS.tokenize(phonemes)
+
+        inputs = {
+            "input_ids": np.array([input_ids], dtype=np.int32),
+            "speaker_ids": np.array([0], dtype=np.int32),
+            "speed_ratios": np.array([1.0], dtype=np.float32),
+            "f0_ratios": np.array([1.0], dtype=np.float32),
+            "energy_ratios": np.array([1.0], dtype=np.float32),
+        }
+
+        mel_output, durations, _ = lightspeech.run(None, inputs)
+        return mel_output, durations
+
+    @staticmethod
+    def mel2wav(mel_output: np.ndarray) -> np.ndarray:
+        # Prepare input for vocoder model
+        inputs = {
+            "mels": mel_output,
+        }
+
+        # Run inference
+        outputs = mbmelgan.run(None, inputs)
+        audio_array = outputs[0][0, :, 0]
+
+        return audio_array
+
+    @staticmethod
+    def synthesize(text: str) -> np.ndarray:
+        mel_output, _ = TTS.text2mel(text)
+        audio_array = TTS.mel2wav(mel_output)
+        return audio_array
+
+    @staticmethod
+    def save_audio(audio_array: np.ndarray, path: str):
+        sf.write(path, audio_array, 44100)
+
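A minimal end-to-end usage sketch for the TTS class above (the output path is hypothetical; the outputs/ folder is git-ignored as shown earlier):

    from tts import TTS

    audio = TTS.generate("Kiswahili lugha yangu.")
    TTS.save_audio(audio, "./outputs/example.wav")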