# BrT-cot-t2t-mini.py — Turkish chain-of-thought (CoT) text-to-text mini model:
# training and interactive inference script.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer
import torch.nn.functional as F
import random
# Tokenizer choice: a cased BERT tokenizer optimized for Turkish text.
TOKENIZER_NAME = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# Load the conversation dataset; ChatDataset below reads the
# 'Input', 'CoT' and 'Output' columns from this CSV.
# NOTE(review): hard-coded absolute Windows path — consider making configurable.
dataset_path = "C:\\Users\\bertu\\Downloads\\BrT-t2t_turkish_conversation_complete_100.csv"
df = pd.read_csv(dataset_path)
# Custom Dataset over the (Input, CoT, Output) conversation table.
class ChatDataset(Dataset):
    """Tokenizes each (input, chain-of-thought, output) row into fixed-length tensors.

    Expects a DataFrame with 'Input', 'CoT' and 'Output' string columns and a
    HuggingFace-style tokenizer returning 'input_ids' / 'attention_mask'.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.inputs = dataframe['Input'].tolist()
        self.cots = dataframe['CoT'].tolist()
        self.outputs = dataframe['Output'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def _encode(self, text):
        # All three fields share the same padding/truncation policy.
        return self.tokenizer(text, max_length=self.max_length,
                              padding='max_length', truncation=True,
                              return_tensors='pt')

    def __getitem__(self, idx):
        enc_in = self._encode(self.inputs[idx])
        enc_cot = self._encode(self.cots[idx])
        enc_out = self._encode(self.outputs[idx])
        # squeeze() drops the leading batch dim the tokenizer adds.
        return {
            'input_ids': enc_in['input_ids'].squeeze(),
            'attention_mask': enc_in['attention_mask'].squeeze(),
            'cot_ids': enc_cot['input_ids'].squeeze(),
            'labels': enc_out['input_ids'].squeeze(),
        }
# Transformer model: embedding -> encoder stack -> vocabulary projection.
class CustomTransformer(nn.Module):
    """Token-level encoder mapping input ids to per-position vocabulary logits.

    Args:
        vocab_size: tokenizer vocabulary size (output dimension).
        embed_dim: embedding / model width.
        num_heads: attention heads per encoder layer.
        ff_dim: feed-forward hidden size inside each encoder layer.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, vocab_size, embed_dim=512, num_heads=8, ff_dim=1024, num_layers=6):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # FIX: batch_first=True so inputs are (batch, seq, embed). The previous
        # default (batch_first=False) silently treated the batch axis as the
        # sequence axis, mixing tokens across unrelated samples.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, seq_len, vocab_size).

        attention_mask follows the HF convention: 1 = real token, 0 = padding.
        """
        x = self.embedding(input_ids)
        # FIX: attention_mask was accepted but ignored; padded positions now
        # no longer contribute as attention keys.
        x = self.transformer(x, src_key_padding_mask=(attention_mask == 0))
        return self.fc(x)
# Modeli oluşturma ve eğitme
def train_model(train_loader, model, optimizer, criterion, device, epochs=30):
model.to(device)
model.train()
for epoch in range(epochs):
total_loss = 0
print(f"Epoch {epoch+1} başlıyor...")
for batch in train_loader:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
optimizer.zero_grad()
outputs = model(input_ids, attention_mask)
loss = criterion(outputs.transpose(1, 2), labels) # Loss fonksiyonunu düzelttik
loss.backward()
optimizer.step()
total_loss += loss.item()
print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")
torch.save(model.state_dict(), "trained_transformer.pth")
print("Model başarıyla kaydedildi!")
return model
# Restore previously trained weights.
def load_model(model, device):
    """Load weights from "trained_transformer.pth" into `model`.

    Moves the model to `device`, switches it to eval mode and returns it.
    """
    state = torch.load("trained_transformer.pth", map_location=device)
    model.load_state_dict(state)
    model.to(device)
    model.eval()
    print("Model başarıyla yüklendi!")
    return model
# Nucleus Sampling (top-p) token selection.
def nucleus_sampling(logits, p=0.9):
    """Sample one token id per sequence position using nucleus (top-p) filtering.

    Args:
        logits: tensor of shape (1, seq, vocab) — the leading singleton batch
            dimension is squeezed away — or already (seq, vocab).
        p: cumulative probability mass to keep per position.

    Returns:
        LongTensor of sampled token ids, one per sequence position
        (a 0-dim tensor when seq == 1).
    """
    # clone: do not mutate the caller's logits in place.
    logits = logits.squeeze(0).clone()
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    sorted_mask = cumulative_probs > p
    # Shift right so the first token crossing the threshold is still kept.
    sorted_mask[:, 1:] = sorted_mask[:, :-1].clone()
    sorted_mask[:, 0] = False
    # BUG FIX: sorted_mask is in *sorted* order; it must be scattered back to
    # the original vocabulary order before use. The old code applied it
    # positionally to the unsorted logits, often masking the best tokens.
    remove_mask = sorted_mask.scatter(dim=-1, index=sorted_indices, src=sorted_mask)
    logits[remove_mask] = -float("Inf")
    probabilities = F.softmax(logits, dim=-1)
    return torch.multinomial(probabilities, num_samples=1).squeeze()
# Response generation.
def generate_response(model, tokenizer, input_text, device, max_length=256):
    """Generate, print and return a (CoT text, answer text) pair for `input_text`.

    NOTE(review): the model is run twice on the identical input and the sampled
    CoT is never fed back in, so the two decoded strings differ only through
    sampling randomness — the CoT does not condition the final answer.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(input_text, max_length=max_length, padding='max_length',
                            truncation=True, return_tensors='pt').to(device)
        ids = encoded['input_ids']
        mask = encoded['attention_mask']
        # First pass: sample a chain-of-thought string.
        cot_logits = model(ids, mask)
        cot_token_ids = nucleus_sampling(cot_logits, p=0.9).view(-1)
        cot_text = tokenizer.decode(cot_token_ids.tolist(), skip_special_tokens=True)
        # Second pass: sample the answer string.
        answer_logits = model(ids, mask)
        answer_token_ids = nucleus_sampling(answer_logits, p=0.9).view(-1)
        output_text = tokenizer.decode(answer_token_ids.tolist(), skip_special_tokens=True)
    print(f"CoT Çıktısı: {cot_text}")
    print(f"Model Yanıtı: {output_text}")
    print("Yanıt üretme tamamlandı!")
    return cot_text, output_text
# Train the model, reload the saved weights, then chat interactively.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = ChatDataset(df, tokenizer)
    train_loader = DataLoader(dataset, batch_size=8, shuffle=True)
    model = CustomTransformer(vocab_size=len(tokenizer), embed_dim=512)
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    # FIX: ignore padding positions in the loss. Labels are padded to
    # max_length, so without ignore_index the [PAD] token dominates the loss
    # and the model mostly learns to predict padding.
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    model = train_model(train_loader, model, optimizer, criterion, device)
    model = load_model(model, device)
    # Simple REPL; type "çıkış" to quit.
    while True:
        user_input = input("Kullanıcı: ")
        if user_input.lower() == "çıkış":
            break
        generate_response(model, tokenizer, user_input, device)