{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (4.49.0)\n", "Requirement already satisfied: datasets in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (3.3.1)\n", "Requirement already satisfied: torch in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (2.2.1+cu121)\n", "Requirement already satisfied: scikit-learn in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (1.3.2)\n", "Requirement already satisfied: accelerate in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (1.4.0)\n", "Requirement already satisfied: filelock in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (3.17.0)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (0.29.1)\n", "Requirement already satisfied: numpy>=1.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (24.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (2024.11.6)\n", "Requirement already satisfied: requests in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: tokenizers<0.22,>=0.21 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (0.21.0)\n", "Requirement already satisfied: safetensors>=0.4.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (0.5.2)\n", "Requirement already satisfied: tqdm>=4.27 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers) (4.67.1)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets) (19.0.1)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets) (2.1.4)\n", "Requirement already satisfied: xxhash in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets) (3.5.0)\n", "Requirement already satisfied: multiprocess<0.70.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec<=2024.12.0,>=2023.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets) (2024.12.0)\n", "Requirement already satisfied: aiohttp in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets) (3.11.12)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (4.12.2)\n", "Requirement already satisfied: sympy in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (1.13.3)\n", "Requirement already satisfied: networkx in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (3.4.2)\n", "Requirement already satisfied: jinja2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (3.1.5)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (8.9.2.26)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.19.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (2.19.3)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: triton==2.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch) (2.2.0)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.8.61)\n", "Requirement already satisfied: scipy>=1.5.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from scikit-learn) (1.11.4)\n", "Requirement already satisfied: joblib>=1.1.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from scikit-learn) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from scikit-learn) (3.5.0)\n", "Requirement already satisfied: psutil in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from accelerate) (6.1.1)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (2.4.4)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.2)\n", "Requirement already satisfied: async-timeout<6.0,>=4.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (5.0.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (25.1.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (1.5.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (6.1.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (0.2.1)\n", "Requirement already satisfied: yarl<2.0,>=1.17.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets) (1.18.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from requests->transformers) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from requests->transformers) (2.3.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from requests->transformers) (2025.1.31)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from jinja2->torch) (3.0.2)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pandas->datasets) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pandas->datasets) (2025.1)\n", "Requirement already satisfied: tzdata>=2022.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pandas->datasets) (2025.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from sympy->torch) (1.3.0)\n", "Requirement already satisfied: six>=1.5 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install transformers datasets torch scikit-learn accelerate\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "from transformers import RobertaTokenizer\n", "import pandas as pd\n", "\n", "# Load IMDB dataset\n", "dataset = load_dataset(\"imdb\")\n", "\n", "# Load RoBERTa tokenizer\n", "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n", "\n", "# Tokenization function\n", "def tokenize_function(examples):\n", " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True, max_length=512)\n", "\n", "# Tokenize dataset\n", "tokenized_datasets = dataset.map(tokenize_function, batched=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorWithPadding\n", "\n", "# Convert labels to PyTorch format\n", "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", "tokenized_datasets.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n", "\n", "# Data collator to handle padding dynamically\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import RobertaForSequenceClassification\n", "\n", "# Load RoBERTa with classification head\n", "model = RobertaForSequenceClassification.from_pretrained(\"roberta-base\", num_labels=2)\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Transformers version: 4.49.0\n", "Accelerate version: 1.4.0\n", "Torch version: 2.2.1+cu121\n" ] } ], "source": [ "import transformers\n", "import accelerate\n", "\n", "print(\"Transformers version:\", transformers.__version__)\n", "print(\"Accelerate version:\", accelerate.__version__)\n", "import torch\n", "print(\"Torch version:\", torch.__version__)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/transformers/training_args.py:1594: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n", " warnings.warn(\n", "/tmp/ipykernel_1535/2580171433.py:18: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n", " trainer = Trainer(\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [9375/9375 42:38, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation Loss
10.3472000.283987
20.2232000.214965
30.1363000.250880

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=9375, training_loss=0.24904020670572916, metrics={'train_runtime': 2559.5896, 'train_samples_per_second': 29.302, 'train_steps_per_second': 3.663, 'total_flos': 1.9733329152e+16, 'train_loss': 0.24904020670572916, 'epoch': 3.0})" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import TrainingArguments, Trainer\n", "\n", "# Define training arguments\n", "training_args = TrainingArguments(\n", " output_dir=\"./roberta_imdb\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " per_device_train_batch_size=8,\n", " per_device_eval_batch_size=8,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", " logging_dir=\"./logs\",\n", " logging_steps=200,\n", " load_best_model_at_end=True\n", ")\n", "\n", "# Define Trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"test\"],\n", " tokenizer=tokenizer,\n", " data_collator=data_collator\n", ")\n", "\n", "# Train the model\n", "trainer.train()\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [3125/3125 03:08]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "('roberta_imdb_finetuned/tokenizer_config.json',\n", " 'roberta_imdb_finetuned/special_tokens_map.json',\n", " 'roberta_imdb_finetuned/vocab.json',\n", " 'roberta_imdb_finetuned/merges.txt',\n", " 'roberta_imdb_finetuned/added_tokens.json')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Evaluate on test set\n", "trainer.evaluate()\n", "\n", "# Save model & tokenizer\n", "model.save_pretrained(\"roberta_imdb_finetuned\")\n", "tokenizer.save_pretrained(\"roberta_imdb_finetuned\")\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Zipped folder saved as roberta_imdb_finetuned.zip\n" ] } ], "source": [ "import shutil\n", "\n", "# Path to the directory\n", "folder_path = \"roberta_imdb_finetuned\"\n", "zip_filename = \"roberta_imdb_finetuned.zip\"\n", "\n", "# Create a zip archive\n", "shutil.make_archive(zip_filename.replace(\".zip\", \"\"), 'zip', folder_path)\n", "\n", "print(f\"Zipped folder saved as {zip_filename}\")" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }