import os

import librosa
import numpy as np
import pandas as pd
import tensorflow as tf


# Paths and preprocessing settings shared by load_data() and train_model().
DATA_PATH = "data/transcriptions.csv"  # CSV read for its 'text' and 'file' columns
AUDIO_DIR = "data"  # directory the CSV's 'file' entries are joined onto
MODEL_PATH = "model/clone_tts_model.h5"  # where the trained Keras model is saved
SAMPLE_RATE = 22050  # sample rate (Hz) passed to librosa.load
TEXT_MAX_LEN = 100  # transcriptions are padded/truncated to this many characters


def load_data():
    """Load transcriptions and their audio clips as fixed-size arrays.

    Reads ``DATA_PATH`` (a CSV with ``text`` and ``file`` columns), loads each
    listed file from ``AUDIO_DIR`` with librosa at ``SAMPLE_RATE``, and pads
    both the texts and the waveforms to a common size.

    Returns:
        A ``(padded_texts, padded_audios, max_audio_len)`` tuple.
        ``padded_texts`` holds one row per transcription with exactly
        ``TEXT_MAX_LEN`` Unicode code points (right-padded with spaces or
        truncated). ``padded_audios`` holds one row per clip, zero-padded at
        the end to ``max_audio_len`` samples. ``max_audio_len`` is the length
        in samples of the longest clip.

    Raises:
        ValueError: If the CSV contains no rows.
    """
    # Read every cell verbatim as a string: otherwise pandas turns empty cells
    # and transcriptions such as "NA", "null" or "123" into NaN/numbers, which
    # have no ``ljust`` and crash the text encoding below.
    data = pd.read_csv(DATA_PATH, dtype=str, keep_default_na=False)
    if data.empty:
        # Fail with a clear message instead of max() complaining about an
        # empty sequence further down.
        raise ValueError(f"No rows found in {DATA_PATH}")

    texts = data['text'].values

    # NOTE: np.pad below assumes librosa returns 1-D (mono) waveforms, which
    # is librosa.load's default.
    audio_arrays = []
    for file in data['file']:
        audio_path = os.path.join(AUDIO_DIR, file)
        y, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
        audio_arrays.append(y)

    # Zero-pad every clip at the end so all rows share the longest length.
    max_audio_len = max(len(a) for a in audio_arrays)
    padded_audios = np.array(
        [np.pad(a, (0, max_audio_len - len(a))) for a in audio_arrays]
    )

    # Encode each transcription as exactly TEXT_MAX_LEN code points.
    padded_texts = np.array([
        [ord(c) for c in text.ljust(TEXT_MAX_LEN)[:TEXT_MAX_LEN]]
        for text in texts
    ])

    return padded_texts, padded_audios, max_audio_len


def train_model():
    """Train a small dense text-to-waveform model and save it to ``MODEL_PATH``.

    Loads the dataset via ``load_data()``, builds a Keras ``Sequential`` model
    (``TEXT_MAX_LEN`` inputs -> Dense(256, relu) -> Dense(audio_len)), fits it
    with the Adam optimizer and mean-squared-error loss for 10 epochs at batch
    size 4, then writes the model to ``MODEL_PATH``, creating its parent
    directory if needed. Progress messages are printed to stdout.
    """
    print("Loading and preparing data...")
    X, y, audio_len = load_data()

    print("Building model...")
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(TEXT_MAX_LEN,)),
        tf.keras.layers.Dense(256, activation='relu'),
        # One output unit per audio sample of the padded waveform.
        tf.keras.layers.Dense(audio_len),
    ])

    model.compile(optimizer='adam', loss='mse')
    print("Training...")
    model.fit(X, y, epochs=10, batch_size=4)

    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when MODEL_PATH actually has a directory component.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model.save(MODEL_PATH)
    print(f"Model saved to {MODEL_PATH}")


if __name__ == "__main__":
    # Run the full training pipeline only when executed as a script.
    train_model()