# Gemma 3 270M
https://developers.googleblog.com/en/introducing-gemma-3-270m/

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
import os
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Loading Model and tokenizer

In [None]:
model_id = "google/gemma-3-270m-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16, # Use if your hardware supports bfloat16
    )

In [None]:
model.eval()

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((640,), eps

In [None]:
model.device

device(type='cpu')

## Generating text

### function to take in prompt and return response

In [None]:
def run_model(max_tokens=100, model='google/gemma-3-270m-it'):
  # loading model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model)
  model = AutoModelForCausalLM.from_pretrained(
        model,
        device_map="auto",
        torch_dtype=torch.bfloat16, # Use if your hardware supports bfloat16
    )

  while True:
    prompt = input("Enter prompt (or 'q' to quit): ").strip()
    if prompt.lower() == 'q':
        break

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated_output = model.generate(**inputs, max_new_tokens=max_tokens)
    decoded_output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    print(decoded_output, end="\n")

In [None]:
# using pipeline from transformers
from transformers import pipeline

def run_model_pipeline(max_tokens=100, model='google/gemma-3-270m-it'):
  # Load text-generation pipeline
  generator = pipeline("text-generation", model=model, device=-1)  # device=0 for GPU, -1 for CPU

  while True:
      prompt = input("Enter prompt (or 'q' to quit): ").strip()
      if prompt.lower() == 'q':
          break

      # Generate text
      outputs = generator(prompt, max_new_tokens=max_tokens)
      print(outputs[0]['generated_text'], end="\n")


In [None]:
run_model_pipeline()

Device set to use cpu


Enter prompt (or 'q' to quit): Hello
Hello there!

I'm happy to answer any questions you might have. Please feel free to reach out.

Best regards!
[Your Name]
[Your Contact Information]

Enter prompt (or 'q' to quit): who are you?
who are you?

I'm a new program, I'm excited to learn and contribute. I'm ready to help you with your tasks.

What can I do for you today?
I'm excited to learn and contribute. I'm ready to help you with your tasks.

What are you looking for?
I'm looking for a new program that is a great value and has a lot of features.
I am looking for a program that has a good user experience and
Enter prompt (or 'q' to quit): do you have a name? what do i call you by?
do you have a name? what do i call you by?
I'm looking for a name that's unique and memorable, and has a touch of personality.

What are some words that could be used in the context of a wedding?
I'm looking for a name that's unique and memorable, and has a touch of personality.

I'm looking for a name that's 

### Checking on unstruct data to return json

In [None]:
run_model_pipeline()

Device set to use cpu


Enter prompt (or 'q' to quit): Convert the following into valid, well-formatted JSON. Name: Alice Johnson | Age: 29 | Email - alice.johnson@email.com Lives in New York, works as a product manager. Phone: (555) 123-4567 Hobbies include reading, cycling and cooking.
Convert the following into valid, well-formatted JSON. Name: Alice Johnson | Age: 29 | Email - alice.johnson@email.com Lives in New York, works as a product manager. Phone: (555) 123-4567 Hobbies include reading, cycling and cooking.
```json
{
  "Alice": {
    "age": 29,
    "email": "alice.johnson@email.com",
    "lives": "New York",
    "phone": "(555) 123-4567"
  },
  "Alice_ProductManager": {
    "age": 29,
    "email": "alice.johnson@email.com",
Enter prompt (or 'q' to quit): q


## Generating with optimal config and streamer

In [None]:
from transformers import pipeline, TextStreamer

def run_model_stream(max_tokens=100, model='google/gemma-3-270m-it'):
  generator = pipeline(
      "text-generation",
      model=model,
      device=-1,  # GPU=0; -1 CPU
  )

  while True:
      prompt = input("Enter prompt (or 'q' to quit): ").strip()
      if prompt.lower() == 'q':
          break

      streamer = TextStreamer(generator.tokenizer)

      generator(
          prompt,
          max_new_tokens=max_tokens,
          do_sample=True,
          temperature=1.0,
          top_k=64,
          top_p=0.95,
          streamer=streamer
      )

In [None]:
run_model_stream()

Device set to use cpu


Enter prompt (or 'q' to quit): Hey there? I'm Toheed how are you?
<bos>Hey there? I'm Toheed how are you? How can I help you today?
Hi Toheed, I hope you are doing well.
Here's a message to you:
Toheed, I'm here for you! If you need anything, just let me know.
Thank you for your time.
Toheed, I appreciate your kind words.
Hi Toheed, I hope you are doing well.
Here's a message to you:
Hi Toheed, I hope you are doing well.

Enter prompt (or 'q' to quit): Convert the following into valid, well-formatted JSON. Name: Alice Johnson | Age: 29 | Email - alice.johnson@email.com Lives in New York, works as a product manager. Phone: (555) 123-4567 Hobbies include reading, cycling and cooking.
<bos>Convert the following into valid, well-formatted JSON. Name: Alice Johnson | Age: 29 | Email - alice.johnson@email.com Lives in New York, works as a product manager. Phone: (555) 123-4567 Hobbies include reading, cycling and cooking.
```json
{
"Alice Johnson": {
    "age": 29,
    "email": "alice.johnso

## Testing with other languages

In [None]:
run_model_stream()

Device set to use cpu


Enter prompt (or 'q' to quit): आप कैसे हैं
<bos>आप कैसे हैं? क्या आपको कोई चिंता है?

मैं यहाँ हूँ।

**निम्नलिखित प्रश्नों के उत्तर देने के लिए मैं यहाँ हूँ:**

1.  **क्या आप किसी खास चीज़ से जानते हैं?** (जैसे, किसी ऐसे विषय से संबंधित किसी विशेष चीज, किसी विषय से संबंधित किसी विशेष चीज, या किसी विशेष विषय से संबंधित किसी विशेष चीज)
2.  **क्या आप किसी ऐसे विषय से संबंधित हैं जो आपको परेशान करता है?**
3.  **क्या आप किसी ऐसे विषय
Enter prompt (or 'q' to quit): q


# Fine tuning (PEFT, LoRA)
using dataset with small genz convos

## finetuning fp32

### Loading dataset

In [None]:
from datasets import load_dataset

In [None]:
dataset_path = "/content/indian_genz_chatt_dataset_500.json"
dataset = load_dataset("json", data_files=dataset_path)['train']
dataset

In [3]:
dataset[1]

{'topic': 'Job Market & The Grind',
 'prompt': 'LinkedIn pe connections badhane ke liye fake enthusiasm dikhani pad rahi hai.',
 'response': 'Corporate facade maintain karna exhausting hai yaar, but game khelna padta hai. no cap'}

In [4]:
def format_example(example):
    return {
        "text": (
            f"<start_of_turn>user\n{example['prompt']}<end_of_turn>\n"
            f"<start_of_turn>model\n{example['response']}<end_of_turn>"
        )
    }

gemma3_dataset = dataset.map(format_example, remove_columns=dataset.column_names)

In [5]:
split_dataset = gemma3_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

In [6]:
len(train_dataset), len(eval_dataset)

(108, 12)

In [7]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 108
})

### preparing for training (fp32)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model = "google/gemma-3-270m-it"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

print(f"Device: {model.device}")
print(f"DType: {model.dtype}")

Device: cpu
DType: torch.float32


In [None]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 15,187,968 || all params: 283,286,144 || trainable%: 5.3614


In [None]:
!pip install trl

Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [None]:
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = peft_model,
    train_dataset = gemma3_dataset,
    args = SFTConfig(
      dataset_text_field="text",
      per_device_train_batch_size=32,
      gradient_accumulation_steps=6,
      warmup_steps=10,
      num_train_epochs=3,
      learning_rate=2e-5,
      logging_strategy="steps",
      logging_steps=1,
      optim="adamw_torch_fused",
      weight_decay=0.01,
      lr_scheduler_type="linear",
      seed=3407,
      fp16=False,
      output_dir="outputs",
      report_to="none",
  )

)

In [None]:
trainer.train() # after a few iterations

Step,Training Loss
1,1.4347
2,1.4343
3,1.429


TrainOutput(global_step=3, training_loss=1.432644248008728, metrics={'train_runtime': 18.9818, 'train_samples_per_second': 18.966, 'train_steps_per_second': 0.158, 'total_flos': 12741654491136.0, 'train_loss': 1.432644248008728})

### saving model fp32

In [None]:
trainer.model.save_pretrained("gemma-genz-270M-peft")
tokenizer.save_pretrained("gemma-genz-270M-peft")

('gemma-genz-270M-peft/tokenizer_config.json',
 'gemma-genz-270M-peft/special_tokens_map.json',
 'gemma-genz-270M-peft/chat_template.jinja',
 'gemma-genz-270M-peft/tokenizer.model',
 'gemma-genz-270M-peft/added_tokens.json',
 'gemma-genz-270M-peft/tokenizer.json')

In [None]:
# saving to local
import shutil

# Path to your saved model
model_folder = "gemma-genz-270M-peft"
zip_name = "gemma-genz-270M-peft.zip"

# Create zip
shutil.make_archive("gemma-genz-270M-peft", 'zip', model_folder)

'/content/gemma-genz-270M-peft.zip'

In [None]:
from google.colab import files

files.download(zip_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Inference fp32

In [47]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

device = 0 if torch.cuda.is_available() else -1  # pipeline uses int device

base_model_name = "google/gemma-3-270m-it"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32)

peft_model_path = "/content/gemma-genz-270M-peft"
model = PeftModel.from_pretrained(base_model, peft_model_path)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device
)

def generate_text_pipeline(prompts, max_new_tokens=100, temperature=1.0, top_k=50, top_p=0.95, do_sample=True):
    """
    Generate text using the preloaded PEFT LoRA model via Hugging Face pipeline.

    Args:
        prompts (str or list[str]): Single prompt or list of prompts.
        max_new_tokens (int): Maximum tokens to generate beyond input.
        temperature (float): Sampling temperature.
        top_k (int): Top-k sampling.
        top_p (float): Top-p sampling (nucleus).
        do_sample (bool): Whether to sample or use greedy decoding.

    Returns:
        list[str]: Generated text(s).
    """
    if isinstance(prompts, str):
        prompts = [prompts]

    outputs = text_gen_pipeline(
        prompts,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample
    )

    return outputs

Device set to use cuda:0


In [51]:
prompt = "job market is dead bro!"
generated = generate_text_pipeline(prompt)
print(generated)

[[{'generated_text': 'job market is dead bro! #opportunityseek #careergoals #adultingNOW #worklifebalance\nHarhar ke liye job search kar raha hoon, identity crisis mein.'}]]


In [48]:
prompt = "life seems to be slipping away from the hands yaar"
generated = generate_text_pipeline(prompt)
generated

[[{'generated_text': "life seems to be slipping away from the hands yaar. Everything feels unpredictable, anxiety peaks and dips hard.\n\nI'm so sorry to hear it. Sometimes it feels like I'm just going through the motions."}]]

## finetune fp16

### preparing for finetuning using loaded dataset

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model = "google/gemma-3-270m-it"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    attn_implementation="eager",
    torch_dtype=torch.float16,
).to(device)
tokenizer = AutoTokenizer.from_pretrained(base_model)

print(f"Device: {model.device}")
print(f"DType: {model.dtype}")

Device: cuda:0
DType: torch.float16


In [9]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM"
)

In [10]:
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 15,187,968 || all params: 283,286,144 || trainable%: 5.3614


In [34]:
!pip install trl

Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [14]:
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = peft_model,
    train_dataset = gemma3_dataset,
    args = SFTConfig(
      dataset_text_field="text",
      per_device_train_batch_size=32,
      gradient_accumulation_steps=6,
      warmup_steps=10,
      num_train_epochs=10,
      learning_rate=5e-5,
      logging_strategy="steps",
      logging_steps=1,
      optim="adamw_torch_fused",
      weight_decay=0.01,
      lr_scheduler_type="linear",
      seed=3407,
      fp16=False,
      output_dir="outputs",
      report_to="none",
  )

)

In [29]:
trainer.train()

Step,Training Loss
1,1.106
2,1.1079
3,1.0958
4,1.075
5,1.0536
6,1.0418
7,1.0287
8,1.0071
9,0.9737
10,0.9331


TrainOutput(global_step=10, training_loss=1.042256236076355, metrics={'train_runtime': 52.4478, 'train_samples_per_second': 22.88, 'train_steps_per_second': 0.191, 'total_flos': 42488815650816.0, 'train_loss': 1.042256236076355})

In [30]:
trainer.train() # 8-9th iteration so 80-90 epochs

Step,Training Loss
1,0.8975
2,0.8923
3,0.885
4,0.8677
5,0.8394
6,0.8249
7,0.8131
8,0.7989
9,0.7708
10,0.7378


TrainOutput(global_step=10, training_loss=0.8327455043792724, metrics={'train_runtime': 53.2992, 'train_samples_per_second': 22.514, 'train_steps_per_second': 0.188, 'total_flos': 42488815650816.0, 'train_loss': 0.8327455043792724})

### Saving model fp16


In [31]:
trainer.model.save_pretrained("gemma-genz-270M-peft-fp16")
tokenizer.save_pretrained("gemma-genz-270M-peft-fp16")

('gemma-genz-270M-peft-fp16/tokenizer_config.json',
 'gemma-genz-270M-peft-fp16/special_tokens_map.json',
 'gemma-genz-270M-peft-fp16/chat_template.jinja',
 'gemma-genz-270M-peft-fp16/tokenizer.model',
 'gemma-genz-270M-peft-fp16/added_tokens.json',
 'gemma-genz-270M-peft-fp16/tokenizer.json')

In [32]:
# saving to local
import shutil

# Path to your saved model
model_folder = "gemma-genz-270M-peft-fp16"
zip_name = "gemma-genz-270M-peft-fp16.zip"

# Create zip
shutil.make_archive("gemma-genz-270M-peft-fp16", 'zip', model_folder)

'/content/gemma-genz-270M-peft-fp16.zip'

In [33]:
from google.colab import files

files.download(zip_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Inference (fp16)

In [54]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

device = 0 if torch.cuda.is_available() else -1  # pipeline uses int device

base_model_name = "google/gemma-3-270m-it"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32)

peft_model_path = "/content/gemma-genz-270M-peft-fp16"
model = PeftModel.from_pretrained(base_model, peft_model_path)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device
)

def generate_text_pipeline_fp16(prompts, max_new_tokens=100, temperature=1.0, top_k=50, top_p=0.95, do_sample=True):
    """
    Generate text using the preloaded PEFT LoRA model via Hugging Face pipeline.

    Args:
        prompts (str or list[str]): Single prompt or list of prompts.
        max_new_tokens (int): Maximum tokens to generate beyond input.
        temperature (float): Sampling temperature.
        top_k (int): Top-k sampling.
        top_p (float): Top-p sampling (nucleus).
        do_sample (bool): Whether to sample or use greedy decoding.

    Returns:
        list[str]: Generated text(s).
    """
    if isinstance(prompts, str):
        prompts = [prompts]

    outputs = text_gen_pipeline(
        prompts,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample
    )

    return outputs

Device set to use cuda:0


In [67]:
prompt = "job market is dead bro!"
generated = generate_text_pipeline_fp16(prompt, max_new_tokens=50)
generated

[[{'generated_text': 'job market is dead bro! Opportunities are still scarce, but demand outstrihes supply. #jobsearching #careeradvice #FOMO\n'}]]

In [63]:
prompt = "life seems to be slipping away from the hands yaar"
generated = generate_text_pipeline(prompt)
generated

[[{'generated_text': 'life seems to be slipping away from the hands yaar. I want to do something meaningful, not just survive. This is the only way to ground myself. NO. Thanks. #MeaningfulLiving #MemoryLane\n'}]]