DeFamy
/

BERT-FineTune-CyberbullyingMalay

Text Classification

Transformers

PyTorch

bert

Model card Files Files and versions Community

DeFamy commited on Dec 25, 2023

Commit

0af51d7

1 Parent(s): d69b533

Upload train_model.ipynb

Browse files

Files changed (1) hide show

train_model.ipynb +909 -0

train_model.ipynb ADDED Viewed

	@@ -0,0 +1,909 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "XLhB2j_Hemio"
+      },
+      "source": [
+        "## Read the dataset csv file"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "hgYEtrYgemir",
+        "outputId": "d3ddedc7-8bd7-4ba9-c82e-68e4eb1309c3"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>Unnamed: 0</th>\n",
+              "      <th>Text</th>\n",
+              "      <th>target</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>0.0</td>\n",
+              "      <td>polis tangkap</td>\n",
+              "      <td>NonCyberbully</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>1.0</td>\n",
+              "      <td>kenapa lokasi kebakaran terlalu spesifik</td>\n",
+              "      <td>NonCyberbully</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>2.0</td>\n",
+              "      <td>menyesal tanya nak for birthday</td>\n",
+              "      <td>NonCyberbully</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>3.0</td>\n",
+              "      <td>meriah tah</td>\n",
+              "      <td>NonCyberbully</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>4.0</td>\n",
+              "      <td>asal bs kelar kerja jam sik kl baru diajak mee...</td>\n",
+              "      <td>NonCyberbully</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "   Unnamed: 0                                               Text  \\\n",
+              "0         0.0                                     polis tangkap    \n",
+              "1         1.0          kenapa lokasi kebakaran terlalu spesifik    \n",
+              "2         2.0                   menyesal tanya nak for birthday    \n",
+              "3         3.0                                        meriah tah    \n",
+              "4         4.0  asal bs kelar kerja jam sik kl baru diajak mee...   \n",
+              "\n",
+              "          target  \n",
+              "0  NonCyberbully  \n",
+              "1  NonCyberbully  \n",
+              "2  NonCyberbully  \n",
+              "3  NonCyberbully  \n",
+              "4  NonCyberbully  "
+            ]
+          },
+          "execution_count": 3,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "df = pd.read_csv('C:/Users/user/Documents/PSM/BERT_Ver2/Transformers-Text-Classification-BERT-Blog-main/input/Tagged_MixedNew.csv')\n",
+        "df.head()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fGUtFkVfemit"
+      },
+      "source": [
+        "## Process the data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "7C3uRWECemiu",
+        "outputId": "8e764d84-010d-4e42-987a-af7162627f6e",
+        "colab": {
+          "referenced_widgets": [
+            "042c8b0b8dcf42eb84660c93778d8ea7",
+            "4ab6074437a849f79be038b043025283",
+            "9aed4d88c18e4e28a1efbbed94331228"
+          ]
+        }
+      },
+      "outputs": [
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "042c8b0b8dcf42eb84660c93778d8ea7",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Downloading (…)okenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "C:\\Users\\user\\anaconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:133: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\user\\.cache\\huggingface\\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+            "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+            "  warnings.warn(message)\n"
+          ]
+        },
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "4ab6074437a849f79be038b043025283",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/233k [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "9aed4d88c18e4e28a1efbbed94331228",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "#from transformers import BertTokenizer\n",
+        "#tokenizer = BertTokenizer.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased')\n",
+        "\n",
+        "from transformers import AutoTokenizer\n",
+        "tokenizer = AutoTokenizer.from_pretrained('mesolitica/bert-base-standard-bahasa-cased')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Ks3XobW0emiu"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score\n",
+        "import torch\n",
+        "from transformers import TrainingArguments, Trainer\n",
+        "from transformers import BertTokenizer, BertForSequenceClassification"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "0ZZx6mUdemiv"
+      },
+      "outputs": [],
+      "source": [
+        "def process_data(row):\n",
+        "\n",
+        "    text = row['Text']\n",
+        "    text = str(text)\n",
+        "    text = ' '.join(text.split())\n",
+        "\n",
+        "    encodings = tokenizer(text, padding=\"max_length\", truncation=True, max_length=128)\n",
+        "\n",
+        "    label = 0\n",
+        "    if row['target'] == 'Cyberbully':\n",
+        "        label += 1\n",
+        "\n",
+        "    encodings['label'] = label\n",
+        "    encodings['Text'] = text\n",
+        "\n",
+        "    return encodings"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "MaFmqSc-emiv",
+        "outputId": "03eb6491-b646-45dd-ef3d-318c81313430"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "{'input_ids': [2, 2039, 3058, 9857, 1606, 1164, 2161, 8062, 1219, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 0, 'Text': 'Saya suka masakan beliau dan cara penyampaiannya'}\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(process_data({\n",
+        "    'Text': 'Saya suka masakan beliau dan cara penyampaiannya',\n",
+        "    'target': 'NonCyberbully'\n",
+        "}))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Lel-2lqKemiw"
+      },
+      "outputs": [],
+      "source": [
+        "processed_data = []\n",
+        "\n",
+        "for i in range(len(df[:1383])):\n",
+        "    processed_data.append(process_data(df.iloc[i]))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "x_DGsKzHemiw"
+      },
+      "source": [
+        "## Generate the dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "oc_NsbnXemiw"
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.model_selection import train_test_split\n",
+        "\n",
+        "new_df = pd.DataFrame(processed_data)\n",
+        "\n",
+        "train_df, valid_df = train_test_split(\n",
+        "    new_df,\n",
+        "    test_size=0.2,\n",
+        "    random_state=2022\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "4qSci5CRemix"
+      },
+      "outputs": [],
+      "source": [
+        "import pyarrow as pa\n",
+        "from datasets import Dataset\n",
+        "\n",
+        "train_hg = Dataset(pa.Table.from_pandas(train_df))\n",
+        "valid_hg = Dataset(pa.Table.from_pandas(valid_df))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "xDgnim7iemix",
+        "outputId": "59858161-59a4-4731-fbfc-7e30a1246eed"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "Dataset({\n",
+              "    features: ['Text', 'attention_mask', 'input_ids', 'label', 'token_type_ids', '__index_level_0__'],\n",
+              "    num_rows: 277\n",
+              "})"
+            ]
+          },
+          "execution_count": 12,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "valid_hg"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8Uqq0cKKemiy"
+      },
+      "source": [
+        "## Create a model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "QQkDAXmRemiz",
+        "outputId": "e00faff0-c7d7-456d-dab2-73d9839c0274",
+        "colab": {
+          "referenced_widgets": [
+            "b9faad28a43547029c8b13ab639f8d05",
+            "6175ea4206304020823d86e0bbc23298"
+          ]
+        }
+      },
+      "outputs": [
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "b9faad28a43547029c8b13ab639f8d05",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Downloading (…)lve/main/config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "6175ea4206304020823d86e0bbc23298",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Some weights of the model checkpoint at mesolitica/bert-base-standard-bahasa-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n",
+            "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mesolitica/bert-base-standard-bahasa-cased and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight']\n",
+            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+          ]
+        }
+      ],
+      "source": [
+        "#from transformers import BertForSequenceClassification\n",
+        "\n",
+        "#model = BertForSequenceClassification.from_pretrained(\n",
+        "#    'malay-huggingface/bert-tiny-bahasa-cased',\n",
+        "#    num_labels=2\n",
+        "#)\n",
+        "\n",
+        "\n",
+        "from transformers import AutoModelForSequenceClassification\n",
+        "\n",
+        "model = AutoModelForSequenceClassification.from_pretrained(\n",
+        "    'mesolitica/bert-base-standard-bahasa-cased',\n",
+        "    num_labels=2\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ifvtnwBMemi1"
+      },
+      "outputs": [],
+      "source": [
+        "def compute_metrics(p):\n",
+        "    print(type(p))\n",
+        "    pred, labels = p\n",
+        "    pred = np.argmax(pred, axis=1)\n",
+        "\n",
+        "    accuracy = accuracy_score(y_true=labels, y_pred=pred)\n",
+        "    recall = recall_score(y_true=labels, y_pred=pred)\n",
+        "    precision = precision_score(y_true=labels, y_pred=pred)\n",
+        "    f1 = f1_score(y_true=labels, y_pred=pred)\n",
+        "\n",
+        "    return {\"accuracy\": accuracy, \"precision\": precision, \"recall\": recall, \"f1\": f1}\n",
+        ""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "50Xy9P7Remi2"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import TrainingArguments, Trainer\n",
+        "\n",
+        "training_args = TrainingArguments(output_dir=\"./result\", evaluation_strategy=\"epoch\")\n",
+        "\n",
+        "trainer = Trainer(\n",
+        "    model=model,\n",
+        "    args=training_args,\n",
+        "    train_dataset=train_hg,\n",
+        "    eval_dataset=valid_hg,\n",
+        "    tokenizer=tokenizer,\n",
+        "    compute_metrics=compute_metrics\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "myIstfgJemi3"
+      },
+      "source": [
+        "## Train and Evaluate the model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-UtAkNHUemi4",
+        "outputId": "5af038f3-a77c-41eb-e48d-747a8e776e38"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "C:\\Users\\user\\anaconda3\\lib\\site-packages\\transformers\\optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+            "  warnings.warn(\n",
+            "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+          ]
+        },
+        {
+          "data": {
+            "text/html": [
+              "\n",
+              "    <div>\n",
+              "      \n",
+              "      <progress value='417' max='417' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+              "      [417/417 56:36, Epoch 3/3]\n",
+              "    </div>\n",
+              "    <table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              " <tr style=\"text-align: left;\">\n",
+              "      <th>Epoch</th>\n",
+              "      <th>Training Loss</th>\n",
+              "      <th>Validation Loss</th>\n",
+              "      <th>Accuracy</th>\n",
+              "      <th>Precision</th>\n",
+              "      <th>Recall</th>\n",
+              "      <th>F1</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <td>1</td>\n",
+              "      <td>No log</td>\n",
+              "      <td>0.493876</td>\n",
+              "      <td>0.779783</td>\n",
+              "      <td>0.657343</td>\n",
+              "      <td>0.886792</td>\n",
+              "      <td>0.755020</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <td>2</td>\n",
+              "      <td>No log</td>\n",
+              "      <td>0.542367</td>\n",
+              "      <td>0.870036</td>\n",
+              "      <td>0.850000</td>\n",
+              "      <td>0.801887</td>\n",
+              "      <td>0.825243</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <td>3</td>\n",
+              "      <td>No log</td>\n",
+              "      <td>0.725669</td>\n",
+              "      <td>0.848375</td>\n",
+              "      <td>0.820000</td>\n",
+              "      <td>0.773585</td>\n",
+              "      <td>0.796117</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table><p>"
+            ],
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "<class 'transformers.trainer_utils.EvalPrediction'>\n",
+            "<class 'transformers.trainer_utils.EvalPrediction'>\n",
+            "<class 'transformers.trainer_utils.EvalPrediction'>\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "TrainOutput(global_step=417, training_loss=0.2771467213436282, metrics={'train_runtime': 3405.0836, 'train_samples_per_second': 0.974, 'train_steps_per_second': 0.122, 'total_flos': 218053287129600.0, 'train_loss': 0.2771467213436282, 'epoch': 3.0})"
+            ]
+          },
+          "execution_count": 16,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "trainer.train()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "fZYGhNyremi4",
+        "outputId": "5119c379-d7e9-48f7-9137-d788f99a3731"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "\n",
+              "    <div>\n",
+              "      \n",
+              "      <progress value='35' max='35' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+              "      [35/35 00:43]\n",
+              "    </div>\n",
+              "    "
+            ],
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "<class 'transformers.trainer_utils.EvalPrediction'>\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "{'eval_loss': 0.7256694436073303,\n",
+              " 'eval_accuracy': 0.8483754512635379,\n",
+              " 'eval_precision': 0.82,\n",
+              " 'eval_recall': 0.7735849056603774,\n",
+              " 'eval_f1': 0.796116504854369,\n",
+              " 'eval_runtime': 44.9419,\n",
+              " 'eval_samples_per_second': 6.164,\n",
+              " 'eval_steps_per_second': 0.779,\n",
+              " 'epoch': 3.0}"
+            ]
+          },
+          "execution_count": 17,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "trainer.evaluate()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "tlw24Ccdemi5"
+      },
+      "source": [
+        "## Save the model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "69n4eVBHemi6"
+      },
+      "outputs": [],
+      "source": [
+        "model.save_pretrained('./model/')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "gC9qDoERemi6",
+        "outputId": "a5514df7-d322-48b9-df27-c799dca6d884"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Looking in indexes: https://download.pytorch.org/whl/cu117\n",
+            "Requirement already satisfied: torch in c:\\users\\user\\anaconda3\\lib\\site-packages (2.0.1+cu118)\n",
+            "Requirement already satisfied: torchvision in c:\\users\\user\\anaconda3\\lib\\site-packages (0.15.2+cu117)\n",
+            "Requirement already satisfied: torchaudio in c:\\users\\user\\anaconda3\\lib\\site-packages (2.0.2+cu117)\n",
+            "Requirement already satisfied: sympy in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (1.11.1)\n",
+            "Requirement already satisfied: jinja2 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (3.1.2)\n",
+            "Requirement already satisfied: filelock in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (3.9.0)\n",
+            "Requirement already satisfied: networkx in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (2.5.1)\n",
+            "Requirement already satisfied: typing-extensions in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (4.4.0)\n",
+            "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (9.4.0)\n",
+            "Requirement already satisfied: numpy in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (1.23.5)\n",
+            "Requirement already satisfied: requests in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (2.28.1)\n",
+            "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from jinja2->torch) (2.1.1)\n",
+            "Requirement already satisfied: decorator<5,>=4.3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from networkx->torch) (4.4.2)\n",
+            "Requirement already satisfied: charset-normalizer<3,>=2 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2.0.4)\n",
+            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (1.26.14)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2.10)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2022.12.7)\n",
+            "Requirement already satisfied: mpmath>=0.19 in c:\\users\\user\\anaconda3\\lib\\site-packages (from sympy->torch) (1.2.1)\n"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "3NBugUKAemi7"
+      },
+      "outputs": [],
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-W3_K_Kjemi7"
+      },
+      "outputs": [],
+      "source": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "yMiT54Ddemi7"
+      },
+      "source": [
+        "## Load the model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mEFnUaM3emi7"
+      },
+      "outputs": [],
+      "source": [
+        "import torch\n",
+        "from transformers import AutoModelForSequenceClassification\n",
+        "\n",
+        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+        "\n",
+        "new_model = AutoModelForSequenceClassification.from_pretrained('./model/').to(device)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "zkDeulcTemi8",
+        "outputId": "2500b324-398b-471b-9c08-48fa79ea9de3"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "ERROR: torch-1.0.1-cp36-cp36m-win_amd64.whl is not a supported wheel on this platform.\n",
+            "\n",
+            "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
+            "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Requirement already satisfied: torchvision in c:\\users\\user\\anaconda3\\lib\\site-packages (0.14.0)\n",
+            "Requirement already satisfied: typing-extensions in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (4.1.1)\n",
+            "Requirement already satisfied: requests in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (2.27.1)\n",
+            "Requirement already satisfied: torch==1.13.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (1.13.0)\n",
+            "Requirement already satisfied: numpy in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (1.24.2)\n",
+            "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (9.0.1)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (3.3)\n",
+            "Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2.0.4)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2022.9.24)\n",
+            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (1.26.9)\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
+            "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install https://download.pytorch.org/whl/cpu/torch-1.0.1-cp36-cp36m-win_amd64.whl\n",
+        "!pip install torchvision"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "WtI-WDBhemi8"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import AutoTokenizer\n",
+        "\n",
+        "new_tokenizer = AutoTokenizer.from_pretrained('mesolitica/bert-base-standard-bahasa-cased')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "S2X_uPYJemi9"
+      },
+      "source": [
+        "## Get predictions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "qXKQEiWxemi9"
+      },
+      "outputs": [],
+      "source": [
+        "import torch\n",
+        "import numpy as np\n",
+        "\n",
+        "def get_prediction(text):\n",
+        "    encoding = new_tokenizer(text, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=128)\n",
+        "    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}\n",
+        "\n",
+        "    outputs = new_model(**encoding)\n",
+        "\n",
+        "    logits = outputs.logits\n",
+        "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+        "    sigmoid = torch.nn.Sigmoid()\n",
+        "    print(sigmoid)\n",
+        "    probs = sigmoid(logits.squeeze().cpu())\n",
+        "    probs = probs.detach().numpy()\n",
+        "    label = np.argmax(probs, axis=-1)\n",
+        "\n",
+        "    if label == 1:\n",
+        "        return {\n",
+        "            'Target': 'Cyberbully',\n",
+        "            'probability': probs[1]\n",
+        "        }\n",
+        "    else:\n",
+        "        return {\n",
+        "            'Target': 'Not Cyberbully',\n",
+        "            'probability': probs[0]\n",
+        "        }"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "NcYq4vmVemi9"
+      },
+      "outputs": [],
+      "source": [
+        "# dir()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CS_2FfAeemi_",
+        "outputId": "106776a5-fced-4329-aa1f-5970a4a71386"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Sigmoid()\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "{'Target': 'Cyberbully', 'probability': 0.9651532}"
+            ]
+          },
+          "execution_count": 24,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "get_prediction('Aku malas kerja dengan orang macam ni menyusahkan orang je')"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.9"
+    },
+    "vscode": {
+      "interpreter": {
+        "hash": "173fe52379437b78f95c8980b8ee9f2930fd7b56889ab31a72735475ddc10c81"
+      }
+    },
+    "colab": {
+      "provenance": []
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}