2F-AI/Muffin-2.7 · Hugging Face

I barely remember anything about this Muffin version, but that's okay. It has 5.8M parameters, and it's an LSTM.
Datasets: a book; I don't remember which one.
code, here:
################################################################
#   Muffin V5.7l -- VERSION 5 large (code name: Elizabeth)     #
#   Now bigger (5.8M parameters)                               #
################################################################

import os
import random
from typing import List

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


class CorpusDataset(Dataset):
    """Sliding-window dataset over a sequence of token indices.

    Item ``i`` is the pair
    ``(data[i:i + seq_length], data[i + 1:i + seq_length + 1])``: an input
    window and the same window shifted one position ahead, both returned as
    tensors.
    """

    def __init__(self, data: List[int], seq_length: int):
        """Store the encoded corpus and the window length.

        Args:
            data: Token indices of the corpus, in corpus order.
            seq_length: Number of tokens in every input/target window.
        """
        self.data = data
        self.seq_length = seq_length

    def __len__(self) -> int:
        """Return the number of complete windows (zero for a short corpus)."""
        # Clamp at zero: a negative result would make len() raise ValueError.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair for window ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            index += size
        # Without this check slicing silently yields truncated windows, and
        # plain iteration over the dataset would never terminate.
        if not 0 <= index < size:
            raise IndexError("CorpusDataset index out of range")
        stop = index + self.seq_length
        input_seq = self.data[index:stop]
        target_seq = self.data[index + 1:stop + 1]
        return torch.tensor(input_seq), torch.tensor(target_seq)


class TextGeneratorNN(nn.Module):
    """Embedding layer, stacked LSTM and a linear projection onto the vocabulary."""

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, num_layers: int):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct tokens; sizes both the embedding
                table and the output layer.
            embedding_dim: Width of each token embedding.
            hidden_dim: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # batch_first=True: the LSTM consumes and produces (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Run the network on a batch of token indices.

        Args:
            x: Tensor of token indices (callers in this file pass
                ``(batch, seq)`` integer tensors).
            hidden: Optional LSTM state from a previous call; ``None`` lets
                the LSTM start from its default state.

        Returns:
            A tuple ``(logits, hidden)``: the per-position vocabulary scores
            and the LSTM state after the last position.
        """
        embedded = self.embedding(x)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        return self.fc(lstm_out), next_hidden


class TextGenerator:
    """Word-level LSTM text generator trained on a single text corpus.

    The constructor reads the corpus, builds the vocabulary and the model,
    restores a checkpoint from ``Models/V5/model-main.pth`` when one exists,
    and prepares a shuffled ``DataLoader`` over sliding windows of the corpus.
    """

    def __init__(self, corpus_path: str, seq_length: int = 20, embedding_dim: int = 128, hidden_dim: int = 256, num_layers: int = 2) -> None:
        """Load the corpus and set up model, optimizer, checkpoint and data.

        Args:
            corpus_path: Path of the UTF-8 text file to learn from.
            seq_length: Number of words per training window.
            embedding_dim: Width of the word embeddings.
            hidden_dim: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.

        Raises:
            ValueError: If a restored checkpoint's vocabulary does not contain
                every word of the corpus.
        """
        self.seq_length = seq_length
        self.corpus = self.load_corpus(corpus_path)
        self.words = self.split_words(self.corpus)
        # Sorted so that the word -> index mapping is identical on every run;
        # iterating a bare set of strings has no stable order across runs.
        self.vocab = sorted(set(self.words))
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

        self.model = TextGeneratorNN(len(self.vocab), embedding_dim, hidden_dim, num_layers)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.CrossEntropyLoss()

        # Directory for saving/loading model
        self.model_path = 'Models/V5/model-main.pth'
        self.training_dir = 'Models/V5'
        os.makedirs(self.training_dir, exist_ok=True)

        # Restore the checkpoint (weights, optimizer state, vocabulary) if any
        if os.path.exists(self.model_path):
            print("Loading saved model from:", self.model_path)
            self.load_model()
        else:
            print("No saved model found. Training from scratch.")

        # Encode the corpus only now: load_model() may have replaced the
        # mapping, and the training data must use the indices the model uses.
        try:
            corpus_indices = [self.word_to_idx[word] for word in self.words]
        except KeyError as err:
            raise ValueError(
                f"Corpus word {err.args[0]!r} is missing from the vocabulary "
                f"stored in {self.model_path}"
            ) from err
        self.dataset = CorpusDataset(corpus_indices, self.seq_length)
        self.dataloader = DataLoader(self.dataset, batch_size=64, shuffle=True)

    def load_corpus(self, file_path: str) -> str:
        """Return the full contents of the UTF-8 text file at ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def split_words(self, input_text: str) -> List[str]:
        """Split ``input_text`` on whitespace and return the pieces."""
        return input_text.split()

    def train(self, epochs: int = 10) -> None:
        """Train the model on the corpus, then save a checkpoint.

        Prints the mean batch loss after every epoch.

        Args:
            epochs: Number of passes over the dataloader.

        Raises:
            ValueError: If the dataloader yields no batches (previously this
                surfaced as a ZeroDivisionError when printing the mean loss).
        """
        num_batches = len(self.dataloader)
        if num_batches == 0:
            raise ValueError("Corpus is too short to build a training batch.")

        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            for input_seq, target_seq in self.dataloader:
                input_seq, target_seq = input_seq.long(), target_seq.long()
                self.optimizer.zero_grad()

                output, _ = self.model(input_seq)
                # Flatten batch and sequence so each position is one sample
                loss = self.loss_fn(output.view(-1, output.size(-1)), target_seq.view(-1))
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / num_batches}")

        # Save the model after training
        print("Saving trained model to:", self.model_path)
        self.save_model()

    def generate(self, start_words: str, length: int, temperature: float, max_extra_words: int = 100) -> str:
        """Generate text that continues ``start_words``.

        ``length`` words are sampled first. Sampling then continues until the
        last word ends with '.', '!' or '?', or until ``max_extra_words``
        further words have been added.

        Args:
            start_words: Whitespace-separated seed words; each must be in the
                vocabulary.
            length: Number of words to sample before looking for punctuation.
            temperature: Divisor applied to the logits before the softmax.
            max_extra_words: Cap on the words added after ``length`` while
                waiting for sentence-ending punctuation. Without a cap the
                loop never ends if the model never samples such a word.

        Returns:
            The seed words followed by the sampled words, joined by spaces.

        Raises:
            ValueError: If ``start_words`` is blank or ``temperature`` is not
                positive.
            KeyError: If a seed word is not in the vocabulary.
        """
        if temperature <= 0:
            raise ValueError("temperature must be greater than zero.")
        current_words = start_words.split()
        if not current_words:
            raise ValueError("start_words must contain at least one word.")
        unknown = [word for word in current_words if word not in self.word_to_idx]
        if unknown:
            raise KeyError(f"Start words not in vocabulary: {unknown}")

        self.model.eval()
        input_seq = torch.tensor([self.word_to_idx[word] for word in current_words]).unsqueeze(0)

        hidden = None
        result = current_words[:]

        for _ in range(length):
            next_word_idx, hidden = self._sample_next(input_seq, hidden, temperature)
            result.append(self.idx_to_word[next_word_idx])
            # After the seed, feed back only the word that was just sampled
            input_seq = torch.tensor([next_word_idx]).unsqueeze(0)

        # Continue generating until we hit punctuation after reaching the length limit
        extra_words = 0
        while extra_words < max_extra_words and not self.ends_with_punctuation(result[-1]):
            next_word_idx, hidden = self._sample_next(input_seq, hidden, temperature)
            result.append(self.idx_to_word[next_word_idx])
            input_seq = torch.tensor([next_word_idx]).unsqueeze(0)
            extra_words += 1

        return ' '.join(result)

    def _sample_next(self, input_seq, hidden, temperature: float):
        """Sample one word index from the model; return ``(index, hidden)``."""
        with torch.no_grad():
            output, hidden = self.model(input_seq, hidden)
        # squeeze(0) drops only the batch axis, so a one-word vocabulary still
        # yields the 1-D distribution torch.multinomial requires.
        probabilities = torch.softmax(output[:, -1, :] / temperature, dim=-1).squeeze(0)
        return torch.multinomial(probabilities, 1).item(), hidden

    @staticmethod
    def ends_with_punctuation(word: str) -> bool:
        """Return True if ``word`` ends with '.', '!' or '?' (False for '')."""
        return word.endswith(('.', '!', '?'))

    def get_random_starting_words(self, word_count: int = 2) -> str:
        """Return ``word_count`` consecutive corpus words from a random position.

        Raises:
            ValueError: If the corpus has fewer than ``word_count`` words.
        """
        if len(self.words) < word_count:
            raise ValueError("Not enough words in the corpus for starting sequence.")
        start_index = random.randint(0, len(self.words) - word_count)
        return ' '.join(self.words[start_index:start_index + word_count])

    def save_model(self):
        """Write model weights, optimizer state and vocabulary to ``model_path``."""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'vocab': self.vocab,
            'word_to_idx': self.word_to_idx,
            'idx_to_word': self.idx_to_word,
        }, self.model_path)

    def load_model(self):
        """Restore model weights, optimizer state and vocabulary from ``model_path``."""
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        # map_location forces every tensor onto the CPU.
        checkpoint = torch.load(self.model_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.vocab = checkpoint['vocab']
        self.word_to_idx = checkpoint['word_to_idx']
        self.idx_to_word = checkpoint['idx_to_word']

    def save_generated_text(self, text: str, file_path: str = './SaveGeneratedText.txt') -> None:
        """Append ``text`` and a newline to the UTF-8 file at ``file_path``."""
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(text + '\n')


# Use the large corpus dataset (dataset-5-large.txt)
corpus_file_path = 'Snapshots/Datasets/dataset-5-large.txt'

# Build the generator; its constructor restores a saved checkpoint if one exists
generator = TextGenerator(corpus_file_path)

# No checkpoint on disk yet: train now (train() saves the model when it finishes)
if not os.path.exists(generator.model_path):
    generator.train(epochs=50)

# Sampling settings, identical for every round
length = 50  # Length of the generated text
temperature = 0.835  # Adjust the randomness (0.835)

# Keep generating new texts until the user answers 'cancel' or 'stop'
# (answering 'yes' saves the text and then carries on with a new one)
while True:
    # Randomly select starting words from the dataset
    start_words = generator.get_random_starting_words(word_count=3)

    # Generate text starting with the randomly selected start_words
    generated_text = generator.generate(start_words, length, temperature)

    print("Starting Words: " + start_words)
    print("Generated Text: " + generated_text)

    # Prompt to save the generated text
    try:
        save_choice = input(">> Do you want to save the generated text? (yes/no/cancel/stop): ").strip().lower()
    except EOFError:
        # stdin was closed (Ctrl-D or exhausted pipe): stop instead of crashing
        print()
        print("Operation cancelled.")
        break

    if save_choice == 'yes':
        generator.save_generated_text(generated_text)
        print("Generated text saved to './SaveGeneratedText.txt'.")

    elif save_choice == 'no':
        print("Generating a new text...")
    elif save_choice in ('cancel', 'stop'):
        print("Operation cancelled.")
        break
    else:
        print("Invalid input. Please respond with 'yes', 'no' or 'cancel'/'stop'.")