In [1]:
# Install the requirements in Google Colab
!pip install transformers datasets trl huggingface_hub

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.13.0-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m25.6 MB/s[0

In [2]:
# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format, DataCollatorForCompletionOnlyLM
import torch

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

In [37]:
# Load the model and tokenizer
model_name = "HuggingFaceTB/SmolLM2-135M"
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name
).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [38]:
# Set up the chat format
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

In [39]:
# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "SmolLM2-135M-SFT-smoltalk"
finetune_tags = ["smol-course","sft_finetuning"]

In [40]:
# Let's test the base model before training
prompt = "Write a haiku about programming"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print("Before training:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Before training:
user
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a haiku about programming
Write a


In [41]:
# Load a sample dataset
from datasets import load_dataset
ds = load_dataset(path="HuggingFaceTB/smoltalk", name="everyday-conversations")
ds

DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})

In [42]:
ds['train'][0]

{'full_topic': 'Travel/Vacation destinations/Beach resorts',
 'messages': [{'content': 'Hi there', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
   'role': 'user'},
  {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
   'role': 'assistant'},
  {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
   'role': 'user'},
  {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
   'role': 'assistant'},
  {'content': "Okay, I'll look into those. Thanks for the recommendations!",
   'role': 'user'},
  {'content': "You're welcome. I hope you find

In [43]:
def process_messages(samples):
    # Add 'human' role logic
    result = []
    for x in samples['messages']:
        if x[-1]['role'] == 'user':  # Add condition for 'human' role
            result.append(x)
        else:
            result.append(x[:-1])  # Truncate the message if condition is not met
    return {'messages': result}

# Applying the function on a dataset
dataset = ds.map(process_messages, batched=True)

In [44]:
# Configure the SFTTrainer
sft_config = SFTConfig(
    output_dir="./sft_output",
    max_steps=500,  # Adjust based on dataset size and desired training duration
    per_device_train_batch_size=16,  # Set according to your GPU memory capacity
    learning_rate=5e-5,  # Common starting point for fine-tuning
    logging_steps=50,  # Frequency for finding training metrics
    save_steps=50,  # Frequency for saving model checkpoints
    eval_strategy="steps",  # Evaluate the model at regular intervals
    eval_steps=50,  # Frequency of evaluation
    use_mps_device=(
        True if device == "mps" else False
    ),  # Use MPS for mixed precision training
    hub_model_id=finetune_name,  # Set a unique name for your model
    report_to=[]
)

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=ds["train"],
    processing_class=tokenizer,
    eval_dataset=ds["test"],
)

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [45]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
50,1.2505,1.109389
100,1.0655,1.067429
150,1.0156,1.044449
200,0.8958,1.034744
250,0.8814,1.030149
300,0.8621,1.029914
350,0.7885,1.028845
400,0.7895,1.027438
450,0.767,1.030825
500,0.7417,1.031717


TrainOutput(global_step=500, training_loss=0.9057471542358398, metrics={'train_runtime': 1030.1888, 'train_samples_per_second': 7.766, 'train_steps_per_second': 0.485, 'total_flos': 1302438402256896.0, 'train_loss': 0.9057471542358398, 'epoch': 3.52112676056338})

In [46]:
# Save the model
trainer.save_model(f"./{finetune_name}")

In [47]:
trainer.push_to_hub(tags=finetune_tags)

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ParitKansal/SmolLM2-135M-SFT-smoltalk/commit/97e8fed11e0a365f181dc40fc9b8ab4a87a98e99', commit_message='End of training', commit_description='', oid='97e8fed11e0a365f181dc40fc9b8ab4a87a98e99', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ParitKansal/SmolLM2-135M-SFT-smoltalk', endpoint='https://huggingface.co', repo_type='model', repo_id='ParitKansal/SmolLM2-135M-SFT-smoltalk'), pr_revision=None, pr_num=None)

In [49]:
# Test the fine-tuned model on the same prompt

# Let's test the base model before training
prompt = "Write about a programming lang"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

# TODO: use the fine-tuned to model generate a response, just like with the base example.
outputs = model.generate(**inputs, max_new_tokens=100)
print("After training:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

After training:
user
Write about a programming lang

What is a programming language?

A programming language is a set of instructions that a computer can understand and execute. It is a set of rules that tells the computer what to do. It is a language that is easy to learn and use.

What is a programming language used for?

A programming language is used to create software programs. It is a language that is used to create computer programs. It is a language that is easy to learn and use.

What
