import json

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Define a simple LSTM-based language model
class SimpleLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)                  # (batch, seq_len, embedding_dim)
        output, hidden = self.lstm(embedded, hidden)  # (batch, seq_len, hidden_dim)
        output = self.linear(output)                  # (batch, seq_len, vocab_size)
        return output, hidden


# Define a custom dataset class.
# Each entry in the JSON file is expected to be a list of token IDs of equal
# length, so the default DataLoader collate function can stack them into a batch.
class CustomDataset(Dataset):
    def __init__(self, data_path):
        with open(data_path, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        return torch.tensor(text, dtype=torch.long)


# Define training parameters
vocab_size = 10000  # Example vocabulary size
embedding_dim = 128
hidden_dim = 256
batch_size = 32
num_epochs = 10

# Initialize the LM
lm = SimpleLM(vocab_size, embedding_dim, hidden_dim)

# Load data
dataset = CustomDataset('training_data.json')
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lm.parameters(), lr=0.001)

# Training loop
lm.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_data = batch[:, :-1]  # Input sequence
        target = batch[:, 1:]       # Target sequence, shifted by one token
        hidden = None

        output, hidden = lm(input_data, hidden)

        # Flatten (batch, seq_len, vocab_size) -> (batch * seq_len, vocab_size)
        # and (batch, seq_len) -> (batch * seq_len,) for CrossEntropyLoss.
        # reshape() is used because the sliced target tensor is not contiguous,
        # so view() would raise an error.
        output = output.reshape(-1, vocab_size)
        target = target.reshape(-1)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(dataloader):.4f}')

# Save the trained LM
torch.save(lm.state_dict(), 'simple_lm.pth')
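
# --- Usage sketch (an assumption, not part of the original script) ---
# A minimal greedy-decoding example showing how the saved weights could be
# reloaded and used to extend a prompt of token IDs. The prompt values and the
# number of generated tokens are hypothetical placeholders; a real tokenizer
# would map text to and from these IDs.
lm = SimpleLM(vocab_size, embedding_dim, hidden_dim)
lm.load_state_dict(torch.load('simple_lm.pth'))
lm.eval()

prompt = torch.tensor([[1, 5, 42]], dtype=torch.long)  # hypothetical token IDs, shape (1, seq_len)
generated = prompt
hidden = None
with torch.no_grad():
    # Feed the prompt once to build up the LSTM hidden state,
    # then repeatedly pick the most likely next token (greedy decoding).
    output, hidden = lm(generated, hidden)
    next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
    for _ in range(20):  # generate 20 tokens (arbitrary choice)
        generated = torch.cat([generated, next_token], dim=1)
        output, hidden = lm(next_token, hidden)
        next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)

print(generated.tolist())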