import random

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

# Tokenizer optimized for Turkish text.
TOKENIZER_NAME = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Conversation dataset; expected columns: 'Input', 'CoT', 'Output'.
dataset_path = "C:\\Users\\bertu\\Downloads\\BrT-t2t_turkish_conversation_complete_100.csv"
df = pd.read_csv(dataset_path)


class ChatDataset(Dataset):
    """Tokenizes (Input, CoT, Output) triples into fixed-length tensors.

    Each item is padded/truncated to ``max_length`` so batches stack cleanly.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.inputs = dataframe['Input'].tolist()
        self.cots = dataframe['CoT'].tolist()
        self.outputs = dataframe['Output'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return one tokenized sample as a dict of 1-D LongTensors."""
        encode = lambda text: self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        input_encoding = encode(self.inputs[idx])
        cot_encoding = encode(self.cots[idx])
        output_encoding = encode(self.outputs[idx])
        # squeeze() drops the batch dim the tokenizer adds; DataLoader re-batches.
        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'cot_ids': cot_encoding['input_ids'].squeeze(),
            'labels': output_encoding['input_ids'].squeeze(),
        }


class CustomTransformer(nn.Module):
    """Encoder-only Transformer mapping token ids to per-position vocab logits.

    NOTE(review): this is an encoder, not a seq2seq decoder — "generation"
    downstream samples one token per input position rather than autoregressively.
    """

    def __init__(self, vocab_size, embed_dim=512, num_heads=8, ff_dim=1024, num_layers=6):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # BUG FIX: batch_first=True — the embedding output is (batch, seq, dim);
        # the default layout (seq, batch, dim) would silently transpose axes.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, seq_len, vocab_size).

        BUG FIX: the attention mask was previously accepted but ignored, so
        padding tokens leaked into attention. src_key_padding_mask expects
        True at positions to IGNORE, i.e. where attention_mask == 0.
        """
        x = self.embedding(input_ids)
        pad_mask = attention_mask == 0
        x = self.transformer(x, src_key_padding_mask=pad_mask)
        return self.fc(x)


def train_model(train_loader, model, optimizer, criterion, device, epochs=30):
    """Train ``model`` on ``train_loader`` and save weights to disk.

    ``criterion`` is expected to be a CrossEntropyLoss-style callable taking
    (batch, vocab, seq) logits vs (batch, seq) label ids — hence the transpose.
    Returns the trained model.
    """
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        print(f"Epoch {epoch+1} başlıyor...")
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            # CrossEntropyLoss wants class dim second: (B, V, S) vs (B, S).
            loss = criterion(outputs.transpose(1, 2), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")
    torch.save(model.state_dict(), "trained_transformer.pth")
    print("Model başarıyla kaydedildi!")
    return model


def load_model(model, device):
    """Load saved weights into ``model``, move it to ``device``, set eval mode."""
    model.load_state_dict(torch.load("trained_transformer.pth", map_location=device))
    model.to(device)
    model.eval()
    print("Model başarıyla yüklendi!")
    return model


def nucleus_sampling(logits, p=0.9):
    """Top-p (nucleus) sampling over the last dim of ``logits``.

    Expects (1, seq_len, vocab) or (seq_len, vocab) logits; samples one token
    id per sequence position and returns a (seq_len,) tensor of ids.
    """
    logits = logits.squeeze(0)  # -> (seq_len, vocab)
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    # Drop tokens beyond cumulative probability p, but always keep the top-1.
    sorted_indices_to_remove = cumulative_probs > p
    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
    sorted_indices_to_remove[:, 0] = False
    # BUG FIX: the removal mask is in SORTED order; it must be scattered back
    # to original vocab order before masking, otherwise the wrong vocabulary
    # entries are suppressed.
    indices_to_remove = torch.zeros_like(sorted_indices_to_remove).scatter(
        1, sorted_indices, sorted_indices_to_remove
    )
    logits = logits.masked_fill(indices_to_remove, -float("inf"))
    probabilities = F.softmax(logits, dim=-1)
    return torch.multinomial(probabilities, num_samples=1).squeeze()


def generate_response(model, tokenizer, input_text, device, max_length=256):
    """Sample a CoT string and an answer string for ``input_text``.

    Runs the encoder once (the original ran the identical forward pass twice)
    and draws two independent nucleus samples from the same logits.
    Returns (cot_text, output_text).
    """
    model.eval()
    with torch.no_grad():
        input_encoding = tokenizer(
            input_text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        ).to(device)
        input_ids = input_encoding['input_ids']
        attention_mask = input_encoding['attention_mask']

        # One forward pass; both samples come from the same distribution,
        # exactly as in the original two-pass version.
        logits = model(input_ids, attention_mask)

        cot_ids = nucleus_sampling(logits, p=0.9).view(-1)
        cot_text = tokenizer.decode(cot_ids.tolist(), skip_special_tokens=True)

        output_ids = nucleus_sampling(logits, p=0.9).view(-1)
        output_text = tokenizer.decode(output_ids.tolist(), skip_special_tokens=True)

    print(f"CoT Çıktısı: {cot_text}")
    print(f"Model Yanıtı: {output_text}")
    print("Yanıt üretme tamamlandı!")
    return cot_text, output_text


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dataset = ChatDataset(df, tokenizer)
    train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

    model = CustomTransformer(vocab_size=len(tokenizer), embed_dim=512)
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    # Improvement: do not compute loss on padding positions.
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    model = train_model(train_loader, model, optimizer, criterion, device)
    model = load_model(model, device)

    while True:
        user_input = input("Kullanıcı: ")
        if user_input.lower() == "çıkış":
            break
        generate_response(model, tokenizer, user_input, device)