import json

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


class SimpleLM(nn.Module):
    """A small LSTM language model: token embedding -> LSTM -> vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch, seq_len) token ids; hidden=None lets the LSTM start from zeros.
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded, hidden)
        output = self.linear(output)  # (batch, seq_len, vocab_size) logits
        return output, hidden


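# Quick sanity check (illustrative only, not part of the original script):
# with a tiny model, the logits should come out as (batch, seq_len, vocab_size).
#
#   demo = SimpleLM(vocab_size=10, embedding_dim=8, hidden_dim=16)
#   logits, _ = demo(torch.zeros(2, 5, dtype=torch.long))
#   assert logits.shape == (2, 5, 10)

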
class CustomDataset(Dataset):
    """Loads pre-tokenized sequences (JSON lists of token ids) from disk."""

    def __init__(self, data_path):
        # Use a context manager so the file handle is closed promptly.
        with open(data_path, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        return torch.tensor(text, dtype=torch.long)


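# The default DataLoader collation stacks samples with torch.stack, which only
# works if every sequence in the JSON file has the same length. If the
# sequences vary in length, a padding collate function is needed; the sketch
# below is an assumption on my part, not part of the original script. With
# padding, the loss should also skip pad positions, e.g.
# nn.CrossEntropyLoss(ignore_index=0).
from torch.nn.utils.rnn import pad_sequence


def pad_collate(batch):
    # Right-pad every sequence in the batch to the longest one with token id 0.
    return pad_sequence(batch, batch_first=True, padding_value=0)

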
# Hyperparameters.
vocab_size = 10000
embedding_dim = 128
hidden_dim = 256
batch_size = 32
num_epochs = 10


# Build the model.
lm = SimpleLM(vocab_size, embedding_dim, hidden_dim)


dataset = CustomDataset('training_data.json')
# If sequences vary in length, add collate_fn=pad_collate here (see sketch above).
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lm.parameters(), lr=0.001)


for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        # Next-token prediction: the input is tokens 0..n-2, the target is
        # the same sequence shifted left by one (tokens 1..n-1).
        input_data = batch[:, :-1]
        target = batch[:, 1:]
        hidden = None  # fresh zero hidden state for every batch

        output, hidden = lm(input_data, hidden)
        # Flatten to (batch * seq_len, vocab_size) and (batch * seq_len,) for the
        # loss; reshape (rather than view) handles the non-contiguous slice above,
        # on which .view(-1) would raise a RuntimeError.
        output = output.reshape(-1, vocab_size)
        target = target.reshape(-1)

        loss = criterion(output, target)
        loss.backward()
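        # Optional and not in the original script: LSTM language models often
        # benefit from gradient clipping to avoid exploding gradients.
        # torch.nn.utils.clip_grad_norm_(lm.parameters(), max_norm=1.0)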
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(dataloader):.4f}')


# Persist the trained weights.
torch.save(lm.state_dict(), 'simple_lm.pth')
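
# To reuse the weights later (sketch, assuming the same hyperparameters):
#   lm = SimpleLM(vocab_size, embedding_dim, hidden_dim)
#   lm.load_state_dict(torch.load('simple_lm.pth'))
#   lm.eval()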