|
import os |
|
import json |
|
import numpy as np |
|
import scipy.io.wavfile as wavfile |
|
from onnxruntime import InferenceSession |
|
from phonemizer import phonemize |
|
|
|
|
|
# Location of the Kokoro JSON configuration file.
CONFIG_PATH = "./config_kokoro.json"

# Read the whole file as UTF-8 text and parse it as JSON.
with open(CONFIG_PATH, mode="r", encoding="utf-8") as config_file:
    config = json.loads(config_file.read())

# The "vocab" entry of the configuration is the phoneme -> ID lookup table.
phoneme_to_id = config["vocab"]
|
|
|
|
|
# Sentence to be synthesized.
text = "Hi how are you, what is your name. tell me something"

# Phonemizer settings: US English through the espeak backend, with
# punctuation preserved and stress marks included in the output.
phonemizer_options = {
    "language": "en-us",
    "backend": "espeak",
    "strip": True,
    "preserve_punctuation": True,
    "with_stress": True,
}

# Convert the input text into its phoneme representation.
phonemes = phonemize(text, **phonemizer_options)
|
|
|
|
|
# Keep only the symbols that have an entry in the vocabulary, so the ID
# lookup further down cannot hit a missing key.
known_symbols = filter(lambda symbol: symbol in phoneme_to_id, phonemes)
phonemes = "".join(known_symbols)
print("Phonemes:", phonemes)

# Translate every remaining symbol into its vocabulary ID, in order.
tokens = []
for symbol in phonemes:
    tokens.append(phoneme_to_id[symbol])
print("Token IDs:", tokens)
|
|
|
|
|
# Enforce the 510-token limit before the two extra 0 tokens are added below.
# This is an explicit raise rather than an `assert`: assertions are removed
# when Python runs with -O, which would let an over-long sequence through.
if len(tokens) > 510:
    raise ValueError("Token sequence too long (max 510 phonemes)")

# Load the voice file as raw float32 values and view it as a stack of
# (1, 256) style vectors.
voices = np.fromfile('./voices/af.bin', dtype=np.float32).reshape(-1, 1, 256)

# The style vector is selected by the token count (before the 0 tokens are
# added); an out-of-range count raises IndexError here.
ref_s = voices[len(tokens)]

# Surround the sequence with a 0 token on each side (presumably the padding
# ID) and wrap it in an outer list so it forms a batch of one.
tokens = [[0, *tokens, 0]]
|
|
|
|
|
# Model file to load from the local "onnx" directory.
model_name = 'model.onnx'
model_path = os.path.join('onnx', model_name)
sess = InferenceSession(model_path)

# Inputs fed to the model: the token batch, the selected style vector and a
# one-element float32 speed array set to 1.
model_inputs = dict(
    input_ids=tokens,
    style=ref_s,
    speed=np.ones(1, dtype=np.float32),
)

# No output names are specified (None); the first returned output is the audio.
outputs = sess.run(None, model_inputs)
audio = outputs[0]
|
|
|
|
|
# Save the first entry of `audio` as a WAV file with a 24000 Hz sample rate.
output_path, sample_rate = 'audio.wav', 24000
wavfile.write(output_path, sample_rate, audio[0])

print("✅ Audio saved to audio.wav")
|
|