import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer
import torch.nn.functional as F
import random

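# The pretrained Turkish BERT tokenizer is used only for tokenization and decoding;
# the Transformer defined below is trained from scratch on the CSV conversation data.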
TOKENIZER_NAME = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

dataset_path = "C:\\Users\\bertu\\Downloads\\BrT-t2t_turkish_conversation_complete_100.csv"
df = pd.read_csv(dataset_path)

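# Dataset wrapper: each row provides an 'Input', a chain-of-thought ('CoT') and an
# 'Output' column, all tokenized to fixed-length tensors. Note that 'cot_ids' is
# returned here but never consumed by the training loop below.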
class ChatDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=256):
        self.inputs = dataframe['Input'].tolist()
        self.cots = dataframe['CoT'].tolist()
        self.outputs = dataframe['Output'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        cot_text = self.cots[idx]
        output_text = self.outputs[idx]

        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        cot_encoding = self.tokenizer(cot_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        output_encoding = self.tokenizer(output_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'cot_ids': cot_encoding['input_ids'].squeeze(),
            'labels': output_encoding['input_ids'].squeeze()
        }

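# Encoder-only Transformer trained from scratch: every input position is projected
# to vocabulary logits, so output tokens are predicted position-by-position rather
# than autoregressively.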
class CustomTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, num_heads=8, ff_dim=1024, num_layers=6):
        super(CustomTransformer, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True so the encoder accepts (batch, seq, embed), the shape produced by the embedding layer
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        # src_key_padding_mask expects True at positions that should be ignored (padding)
        x = self.transformer(x, src_key_padding_mask=(attention_mask == 0))
        x = self.fc(x)
        return x

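# Training loop: cross-entropy between the per-position logits and the tokenized
# 'Output' text; the trained weights are saved to trained_transformer.pth.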
def train_model(train_loader, model, optimizer, criterion, device, epochs=30):
    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        print(f"Epoch {epoch+1} başlıyor...")
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            # CrossEntropyLoss expects the class dimension at position 1, i.e. (batch, vocab, seq)
            loss = criterion(outputs.transpose(1, 2), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

    torch.save(model.state_dict(), "trained_transformer.pth")
    print("Model başarıyla kaydedildi!")
    return model

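# Reload the saved checkpoint and switch to evaluation mode for inference.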
def load_model(model, device):
    model.load_state_dict(torch.load("trained_transformer.pth", map_location=device))
    model.to(device)
    model.eval()
    print("Model başarıyla yüklendi!")
    return model

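# Nucleus (top-p) sampling: keep only the smallest set of tokens whose cumulative
# probability reaches p, then sample one token per position from that set.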
def nucleus_sampling(logits, p=0.9):
    logits = logits.squeeze(0)
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    # Mark tokens beyond the nucleus, shifting by one so the most likely token is always kept
    sorted_indices_to_remove = cumulative_probs > p
    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
    sorted_indices_to_remove[:, 0] = False

    # Map the mask from sorted order back to the original vocabulary order before applying it
    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
    logits[indices_to_remove] = -float("Inf")
    probabilities = F.softmax(logits, dim=-1)

    return torch.multinomial(probabilities, num_samples=1).squeeze()

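# Inference: the same encoder forward pass is run twice, once for the CoT text and
# once for the final response, so the two texts differ only through sampling.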
def generate_response(model, tokenizer, input_text, device, max_length=256):
    model.eval()
    with torch.no_grad():
        input_encoding = tokenizer(input_text, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt').to(device)
        input_ids = input_encoding['input_ids']
        attention_mask = input_encoding['attention_mask']

        cot_output = model(input_ids, attention_mask)
        cot_ids = nucleus_sampling(cot_output, p=0.9).view(-1)
        cot_text = tokenizer.decode(cot_ids.tolist(), skip_special_tokens=True)

        output = model(input_ids, attention_mask)
        output_ids = nucleus_sampling(output, p=0.9).view(-1)
        output_text = tokenizer.decode(output_ids.tolist(), skip_special_tokens=True)

        print(f"CoT Çıktısı: {cot_text}")
        print(f"Model Yanıtı: {output_text}")
        print("Yanıt üretme tamamlandı!")

    return cot_text, output_text

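# Entry point: train on the CSV, reload the checkpoint, then chat interactively
# until the user types "çıkış" ("exit").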
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = ChatDataset(df, tokenizer)
    train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

    model = CustomTransformer(vocab_size=len(tokenizer), embed_dim=512)
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    # Ignore padding positions in the loss so training is not dominated by [PAD] targets
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    model = train_model(train_loader, model, optimizer, criterion, device)
    model = load_model(model, device)

    while True:
        user_input = input("Kullanıcı: ")
        if user_input.lower() == "çıkış":
            break
        generate_response(model, tokenizer, user_input, device)