|
import sentencepiece as spm
|
|
import tensorflow as tf
|
|
import numpy as np
|
|
import os
|
|
import sys
|
|
|
|
class BrT3Model:
|
|
def __init__(self, vocab_size=372, embedding_dim=1024, model_prefix="sentencepiece", max_seq_len=100, model_name="Br-T-3-preview-low"):
|
|
self.vocab_size = vocab_size
|
|
self.embedding_dim = embedding_dim
|
|
self.model_prefix = model_prefix
|
|
self.model_file = f"{self.model_prefix}.model"
|
|
self.max_seq_len = max_seq_len
|
|
self.model_name = model_name
|
|
self.model_path = f"model_{self.model_name}.weights.h5"
|
|
|
|
self.sp = None
|
|
self.embedding_layer = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim, mask_zero=True)
|
|
|
|
|
|
self.lstm_layers = [tf.keras.layers.LSTM(10240, return_sequences=True) for _ in range(1)]
|
|
|
|
|
|
self.gru_layers = [tf.keras.layers.GRU(8533, return_sequences=True) for _ in range(1)]
|
|
|
|
|
|
self.transformer_blocks = [
|
|
tf.keras.layers.Dense(512, activation="relu") for _ in range(3 * 5)
|
|
]
|
|
|
|
self.output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')
|
|
self.classifier = tf.keras.layers.Dense(1, activation='sigmoid')
|
|
self.model = None
|
|
|
|
def train_tokenizer(self, data_file):
|
|
print(f"🚀 '{data_file}' dosyası ile SentencePiece eğitimi başlıyor...")
|
|
spm.SentencePieceTrainer.train(input=data_file, model_prefix=self.model_prefix, vocab_size=self.vocab_size, model_type="word")
|
|
self.sp = spm.SentencePieceProcessor()
|
|
self.sp.load(self.model_file)
|
|
print(f"✅ Tokenizer eğitildi! VOCAB SIZE: {self.vocab_size}")
|
|
|
|
def create_model(self):
|
|
inputs = tf.keras.Input(shape=(None,))
|
|
embedded = self.embedding_layer(inputs)
|
|
x = embedded
|
|
|
|
|
|
for lstm in self.lstm_layers:
|
|
x = lstm(x)
|
|
|
|
|
|
for gru in self.gru_layers:
|
|
x = gru(x)
|
|
|
|
|
|
for transformer in self.transformer_blocks:
|
|
x = transformer(x)
|
|
|
|
outputs = self.output_layer(x)
|
|
classifier_output = self.classifier(x[:, -1, :])
|
|
|
|
self.model = tf.keras.Model(inputs, [outputs, classifier_output])
|
|
self.model.compile(optimizer=tf.keras.optimizers.Adam(),
|
|
loss=[tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), tf.keras.losses.BinaryCrossentropy()],
|
|
loss_weights=[0.8, 0.2])
|
|
print(f"✅ Model oluşturuldu! Ağırlık dosyası: {self.model_path}")
|
|
|
|
def train(self, dataset, epochs=1000):
|
|
if not self.model:
|
|
self.create_model()
|
|
|
|
for epoch in range(epochs):
|
|
for batch, (inp, tar, is_question) in enumerate(dataset):
|
|
loss_values = self.model.train_on_batch(inp, [tar, is_question])
|
|
loss_main = float(loss_values[0])
|
|
loss_classifier = float(loss_values[1])
|
|
|
|
print(f"🔹 Batch {batch+1}: Loss (Main): {loss_main:.4f}, Loss (Classifier): {loss_classifier:.4f}")
|
|
|
|
if loss_main < 5:
|
|
print(f"✅ Eğitim durduruldu! Batch {batch+1} içinde loss 0.5'in altına düştü.")
|
|
self.model.save_weights(self.model_path)
|
|
print(f"✅ Model ağırlıkları kaydedildi: {self.model_path}")
|
|
return
|
|
|
|
self.model.save_weights(self.model_path)
|
|
print(f"🎯 Model eğitimi tamamlandı ve ağırlıklar kaydedildi: {self.model_path}")
|
|
|
|
def load_model(self):
|
|
if os.path.exists(self.model_path):
|
|
print(f"✅ Kaydedilmiş model bulundu! Ağırlıklar yükleniyor: {self.model_path}")
|
|
self.create_model()
|
|
self.model.load_weights(self.model_path)
|
|
else:
|
|
print("❌ Model ağırlıkları bulunamadı! Modeli eğitmeniz gerekiyor.")
|
|
|
|
def predict(self, input_text):
|
|
if not self.sp:
|
|
self.sp = spm.SentencePieceProcessor()
|
|
self.sp.load(self.model_file)
|
|
|
|
if not self.model:
|
|
self.load_model()
|
|
|
|
|
|
lang = "TR"
|
|
if "| L: EN" in input_text:
|
|
lang = "EN"
|
|
elif "| L: D" in input_text:
|
|
lang = "D"
|
|
|
|
clean_text = input_text.split("| L:")[0].strip()
|
|
tokenized_input = self.sp.encode(clean_text)
|
|
tokenized_input = np.array(tokenized_input).reshape(1, -1)
|
|
|
|
predictions, is_question = self.model.predict(tokenized_input)
|
|
|
|
is_question = (is_question[0][0] > 0.5)
|
|
|
|
if is_question:
|
|
print("🟢 Soru algılandı, soru-cevap modunda çalışılıyor...")
|
|
else:
|
|
print("🔵 Cümle tamamlanıyor...")
|
|
|
|
predicted_token_id = int(np.argmax(predictions[0][-1]))
|
|
predicted_word = self.sp.decode([predicted_token_id])
|
|
|
|
print(f"🔮 Tahmin edilen yanıt: {predicted_word} ({'Soru' if is_question else 'Cümle'}) - {lang}")
|
|
return predicted_word
|
|
|
|
|
|
MODE = 't'
|
|
|
|
brt3 = BrT3Model()
|
|
|
|
if MODE == 't':
|
|
brt3.train_tokenizer("C:\\Users\\bertu\\OneDrive\\Desktop\\1000_Cumle.txt")
|
|
brt3.train_tokenizer("C:\\Users\\bertu\\Downloads\\qa_dataset.txt")
|
|
data = [(np.random.randint(0, 372, (10,)), np.random.randint(0, 372, (10,)), np.random.randint(0, 2, (1,))) for _ in range(100)]
|
|
dataset = tf.data.Dataset.from_generator(lambda: data, output_signature=(
|
|
tf.TensorSpec(shape=(10,), dtype=tf.int32),
|
|
tf.TensorSpec(shape=(10,), dtype=tf.int32),
|
|
tf.TensorSpec(shape=(1,), dtype=tf.float32)
|
|
)).batch(8)
|
|
brt3.train(dataset)
|
|
|
|
elif MODE == 'u':
|
|
brt3.load_model()
|
|
test_sentence = "Bana bir şiir yaz | L: TR"
|
|
brt3.predict(test_sentence)
|
|
|