# TextToCloneSpeech / train.py
# Uploaded by: Emmylahot12
# Commit: "Create train.py" (745a2c1, verified)
import pandas as pd
import tensorflow as tf
import numpy as np
import librosa
import os
# === CONFIG ===
# CSV read by load_data(); its 'text' and 'file' columns are used.
DATA_PATH: str = "data/transcriptions.csv"
# Directory that the CSV's 'file' entries are joined onto.
AUDIO_DIR: str = "data"
# Where train_model() saves the fitted Keras model.
MODEL_PATH: str = "model/clone_tts_model.h5"
# Sample rate passed to librosa.load() for every clip.
SAMPLE_RATE: int = 22_050
# Texts are space-padded / truncated to this many characters; it is also
# the input width of the model.
TEXT_MAX_LEN: int = 100
# === Load and preprocess dataset ===
def load_data():
    """Load the transcription CSV and its audio clips as fixed-size arrays.

    Reads ``DATA_PATH`` and uses its ``text`` and ``file`` columns. Each
    ``file`` entry is joined onto ``AUDIO_DIR`` and loaded with
    ``librosa.load`` at ``SAMPLE_RATE``; every clip is then zero-padded at
    the end to the length of the longest clip. Each text is right-padded
    with spaces (or truncated) to exactly ``TEXT_MAX_LEN`` characters and
    encoded as a list of Unicode code points via ``ord``.

    Returns:
        tuple: ``(padded_texts, padded_audios, max_audio_len)`` where
        ``padded_texts`` has one row of ``TEXT_MAX_LEN`` code points per
        CSV row, ``padded_audios`` has one row of ``max_audio_len`` samples
        per CSV row, and ``max_audio_len`` is the longest clip length in
        samples.

    Raises:
        ValueError: If the CSV contains no rows, or if any row has no
            transcription text.
    """
    # Force the text column to str: pandas would otherwise infer a numeric
    # dtype for an all-digit column, and str methods would fail below.
    data = pd.read_csv(DATA_PATH, dtype={'text': str})
    if data.empty:
        # Without this, max() below fails with an unhelpful message.
        raise ValueError(f"No rows found in {DATA_PATH}")

    # Missing cells are still read as NaN (a float) even with dtype=str;
    # report them explicitly instead of crashing on text.ljust.
    missing = data['text'].isna()
    if missing.any():
        bad_rows = data.index[missing].tolist()
        raise ValueError(
            f"Missing transcription text in {DATA_PATH} at rows {bad_rows}"
        )
    texts = data['text'].values

    audio_arrays = []
    for file_name in data['file']:
        audio_path = os.path.join(AUDIO_DIR, file_name)
        # librosa.load returns (samples, sample_rate); the rate is already
        # fixed by sr=SAMPLE_RATE, so only the samples are kept.
        samples, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
        audio_arrays.append(samples)

    # Zero-pad every clip at the end so all rows share one length.
    max_audio_len = max(len(a) for a in audio_arrays)
    padded_audios = np.array([
        np.pad(a, (0, max_audio_len - len(a))) for a in audio_arrays
    ])

    # ljust pads short texts with spaces; the slice truncates long ones.
    padded_texts = np.array([
        [ord(ch) for ch in text.ljust(TEXT_MAX_LEN)[:TEXT_MAX_LEN]]
        for text in texts
    ])
    return padded_texts, padded_audios, max_audio_len
# === Build and train model ===
def train_model():
    """Train the text-to-audio model and save it to ``MODEL_PATH``.

    Loads the dataset with ``load_data()``, builds a Keras ``Sequential``
    model (input of ``TEXT_MAX_LEN`` values, a 256-unit ReLU dense layer,
    and a dense output with one unit per audio sample), compiles it with
    the Adam optimizer and mean-squared-error loss, fits it for 10 epochs
    with batch size 4, and saves the result to ``MODEL_PATH``. Progress is
    reported with ``print``.
    """
    print("Loading and preparing data...")
    X, y, audio_len = load_data()

    print("Building model...")
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(TEXT_MAX_LEN,)),
        tf.keras.layers.Dense(256, activation='relu'),
        # One output unit per (padded) audio sample.
        tf.keras.layers.Dense(audio_len),
    ])
    model.compile(optimizer='adam', loss='mse')

    print("Training...")
    model.fit(X, y, epochs=10, batch_size=4)

    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when MODEL_PATH actually has one.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model.save(MODEL_PATH)
    print(f"Model saved to {MODEL_PATH}")
# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train_model()