# Introducing Mini GPT-1 (~55M parameters)

Mini GPT-1 is a custom decoder-only transformer (GPT-1-style) trained from scratch on the BookCorpus dataset using PyTorch, by Dilip Pokhrel.
## Model Details
- Architecture: Decoder-only Transformer
- Layers: 6
- Embedding Size: 512
- Heads: 8
- Feedforward Dim: 2048
- Sequence Length: 128
- Vocab Size: 35,000
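
As a sanity check on the "~55M" figure, here is a rough, back-of-the-envelope parameter estimate for this configuration. It is a sketch only: it assumes untied input/output embeddings, learned positional embeddings, standard multi-head attention, and a two-layer feedforward block, and it ignores biases and layer norms.

```python
# Rough parameter estimate for the configuration above.
# Assumptions: untied input/output embeddings, learned positional embeddings,
# standard attention + 2-layer FFN; biases and LayerNorm parameters ignored.
vocab_size, max_len, embed_dim, ff_dim, depth = 35000, 128, 512, 2048, 6

token_emb = vocab_size * embed_dim        # input embedding table
pos_emb   = max_len * embed_dim           # learned positional embeddings
attn      = 4 * embed_dim * embed_dim     # Q, K, V, and output projections
ffn       = 2 * embed_dim * ff_dim        # two linear layers in the feedforward block
per_layer = attn + ffn
lm_head   = embed_dim * vocab_size        # output projection (assumed untied)

total = token_emb + pos_emb + depth * per_layer + lm_head
print(f"~{total / 1e6:.1f}M parameters")  # ≈ 54.8M
```

Under these assumptions the estimate lands around 54.8M, consistent with the ~55M headline figure.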
## Tokenizer

The tokenizer was trained with `ByteLevelBPETokenizer` from the `tokenizers` library.
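
For reference, a byte-level BPE tokenizer with this vocabulary size can be trained roughly as follows. This is a minimal sketch, not the original training script; the corpus path and special tokens are assumptions.

```python
from tokenizers import ByteLevelBPETokenizer

# Minimal sketch of training a byte-level BPE tokenizer.
# "bookcorpus.txt" and the special tokens are assumptions, not the original setup.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["bookcorpus.txt"],   # hypothetical path to the raw text corpus
    vocab_size=35000,           # matches the model's vocab size
    min_frequency=2,
    special_tokens=["[PAD]", "[BOS]", "[EOS]", "[UNK]"],  # assumed special tokens
)
tokenizer.save_model("mini-gpt1")  # writes vocab.json and merges.txt
```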
## Inference Example

Run it in Google Colab: https://colab.research.google.com
```python
# Clone the repository only if it is not already present
import os
if not os.path.exists("mini-gpt1"):
    !git clone https://huggingface.co/dilip025/mini-gpt1

# Install dependencies (uncomment if they are not installed yet)
# !pip install torch tokenizers

# Add the repo path to Python
import sys
sys.path.append("mini-gpt1")

# Imports
import torch
from tokenizers import ByteLevelBPETokenizer
from model_code.decoder_only_transformer import DecoderOnlyTransformer

# Load the tokenizer from the repo's vocab and merges files
tokenizer = ByteLevelBPETokenizer(
    "mini-gpt1/vocab.json",
    "mini-gpt1/merges.txt",
)

# Model configuration (must match the checkpoint)
vocab_size = 35000
max_len = 128
embed_dim = 512
num_heads = 8
depth = 6
ff_dim = 2048

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model and load the trained weights
model = DecoderOnlyTransformer(
    vocab_size=vocab_size,
    max_len=max_len,
    embed_dim=embed_dim,
    num_heads=num_heads,
    depth=depth,
    ff_dim=ff_dim,
).to(device)

state_dict = torch.load("mini-gpt1/pytorch_model.bin", map_location=device)
model.load_state_dict(state_dict)
model.eval()
```
```python
# Generation function with temperature and top-k sampling
def generate(model, tokenizer, prompt, max_length=50, temperature=1.0, top_k=50):
    model.eval()
    device = next(model.parameters()).device

    encoding = tokenizer.encode(prompt)
    input_ids = torch.tensor([encoding.ids], dtype=torch.long).to(device)
    generated = input_ids.clone()

    for _ in range(max_length):
        # Keep the context within the model's maximum sequence length
        # (max_len = 128 is defined in the setup cell above)
        context = generated[:, -max_len:]
        with torch.no_grad():
            logits = model(context)  # [1, T, vocab_size]
        next_token_logits = logits[:, -1, :] / temperature

        # Top-k filtering: keep only the k most likely tokens
        if top_k is not None:
            values, indices = torch.topk(next_token_logits, top_k)
            mask = torch.full_like(next_token_logits, float('-inf'))
            mask.scatter_(1, indices, values)
            next_token_logits = mask

        # Sample the next token from the filtered distribution
        probs = torch.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        generated = torch.cat((generated, next_token), dim=1)

        # Optional: stop on the [EOS] token if the tokenizer defines one
        if hasattr(tokenizer, 'token_to_id') and tokenizer.token_to_id('[EOS]') is not None:
            if next_token.item() == tokenizer.token_to_id('[EOS]'):
                break

    return tokenizer.decode(generated[0].tolist())

# Example inference -- run this in a second cell to see gibberish ;)
prompt = "He told me a story"
output = generate(model, tokenizer, prompt, max_length=100, temperature=1.2, top_k=40)
print("Generated Output:\n", output)
```