Introducing Mini GPT-1 (~55M parameters)

This is a custom decoder-only transformer model (GPT-1-style) trained from scratch on the BookCorpus dataset using PyTorch, by Dilip Pokhrel.

Model Details

  • Architecture: Decoder-only Transformer (see the sketch after this list)
  • Layers: 6
  • Embedding Size: 512
  • Heads: 8
  • Feedforward Dim: 2048
  • Sequence Length: 128
  • Vocab Size: 35,000
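
The actual implementation lives in model_code/decoder_only_transformer.py in the repo. For orientation only, here is a minimal sketch of a decoder-only transformer with the configuration above, built on torch.nn; the class and attribute names below are illustrative assumptions, not the repo's API:

import torch
import torch.nn as nn

class SketchDecoderOnly(nn.Module):
    # Hypothetical stand-in for the repo's DecoderOnlyTransformer
    def __init__(self, vocab_size=35000, max_len=128, embed_dim=512,
                 num_heads=8, depth=6, ff_dim=2048):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, embed_dim)  # token embeddings
        self.pos_emb = nn.Embedding(max_len, embed_dim)     # learned positions
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads,
            dim_feedforward=ff_dim, batch_first=True)
        self.blocks = nn.TransformerEncoder(layer, num_layers=depth)
        self.head = nn.Linear(embed_dim, vocab_size)        # LM head

    def forward(self, x):  # x: [batch, seq] of token ids
        seq_len = x.size(1)
        pos = torch.arange(seq_len, device=x.device)
        h = self.tok_emb(x) + self.pos_emb(pos)
        # Causal mask: each position may attend only to earlier positions
        mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(x.device)
        h = self.blocks(h, mask=mask)
        return self.head(h)  # [batch, seq, vocab_size]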

Tokenizer

The tokenizer was trained using ByteLevelBPETokenizer from the Hugging Face tokenizers library.
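
For reference, training such a tokenizer looks roughly like this (the input file path, min_frequency, and special tokens below are assumptions; the card does not specify them):

from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["bookcorpus.txt"],  # placeholder path to the raw training text
    vocab_size=35000,
    min_frequency=2,
    special_tokens=["[PAD]", "[BOS]", "[EOS]", "[UNK]"],
)
tokenizer.save_model(".")  # writes vocab.json and merges.txt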

Inference Example

Run it in Google Colab: https://colab.research.google.com

# Clone only if not already cloned
import os
if not os.path.exists("mini-gpt1"):
    !git clone https://huggingface.co/dilip025/mini-gpt1

# Install dependencies (uncomment if you haven't installed them yet)
# !pip install torch tokenizers

# Add repo path to Python
import sys
sys.path.append("mini-gpt1")

# Imports
from model_code.decoder_only_transformer import DecoderOnlyTransformer
from tokenizers import ByteLevelBPETokenizer
import torch

# Load tokenizer
tokenizer = ByteLevelBPETokenizer(
    "mini-gpt1/vocab.json",
    "mini-gpt1/merges.txt",
)
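
# Optional sanity check (not in the original card): round-trip a sample string
enc = tokenizer.encode("hello world")
print(enc.tokens)                 # byte-level BPE pieces
print(tokenizer.decode(enc.ids))  # should reproduce "hello world"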

# Model config
vocab_size = 35000
max_len = 128
embed_dim = 512
num_heads = 8
depth = 6
ff_dim = 2048

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and weights
model = DecoderOnlyTransformer(
    vocab_size=vocab_size,
    max_len=max_len,
    embed_dim=embed_dim,
    num_heads=num_heads,
    depth=depth,
    ff_dim=ff_dim,
).to(device)

state_dict = torch.load("mini-gpt1/pytorch_model.bin", map_location=device)
model.load_state_dict(state_dict)
model.eval()

# 💡 Generation function with temperature and top-k sampling
@torch.no_grad()  # disable gradient tracking during generation
def generate(model, tokenizer, prompt, max_length=50, temperature=1.0, top_k=50):
    model.eval()
    device = next(model.parameters()).device

    encoding = tokenizer.encode(prompt)
    input_ids = torch.tensor([encoding.ids], dtype=torch.long).to(device)
    generated = input_ids.clone()

    for _ in range(max_length):
        # Crop the context so the input never exceeds the model's max_len (128)
        logits = model(generated[:, -max_len:])  # [1, T, vocab_size]
        next_token_logits = logits[:, -1, :] / temperature

        if top_k is not None:
            values, indices = torch.topk(next_token_logits, top_k)
            mask = torch.full_like(next_token_logits, float('-inf'))
            mask.scatter_(1, indices, values)
            next_token_logits = mask

        probs = torch.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        generated = torch.cat((generated, next_token), dim=1)

        # Optional: stop on [EOS] token
        if hasattr(tokenizer, 'token_to_id') and tokenizer.token_to_id('[EOS]') is not None:
            if next_token.item() == tokenizer.token_to_id('[EOS]'):
                break

    return tokenizer.decode(generated[0].tolist())


# 🔥 Example inference -- run this in a second cell to see some gibberish ;)
prompt = "He told me a story"
output = generate(model, tokenizer, prompt, max_length=100, temperature=1.2, top_k=40)
print("Generated Output:\n", output)