# Fetch the school-math question/answer dataset from the Hugging Face Hub.
ds = load_dataset("higgsfield/school-math-questions")

# Print one raw training record to inspect its 'prompt' / 'completion' fields.
print(ds['train'][1])

# Pair each prompt with its completion, preserving dataset order.
qa_pairs = list(zip(ds['train']['prompt'], ds['train']['completion']))
class MathDataset(torch.utils.data.Dataset):
    """Causal-LM fine-tuning examples built from (question, answer) pairs.

    Every item is one fixed-length sequence ``"Q: {question} A: {answer}<eos>"``.
    ``labels`` mirrors ``input_ids`` position by position, with the prompt and
    the padding replaced by ``IGNORE_INDEX`` so that only answer tokens
    contribute to the loss. Keeping both tensors aligned matters because the
    language-model head shifts ``labels`` against the logits of ``input_ids``;
    tokenizing question and answer as two separate sequences would compare
    unrelated positions.

    Args:
        qa_pairs: Indexable collection of ``(question, answer)`` string pairs.
        tokenizer: Object exposing ``encode(text) -> list[int]`` and the
            attributes ``eos_token_id`` and ``pad_token_id``.
        max_length: Exact length of every returned tensor.
    """

    # Label value skipped by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, qa_pairs, tokenizer, max_length=128):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        """Build the training tensors for pair ``idx``.

        Returns:
            dict with 1-D ``torch.long`` tensors of length ``max_length``:
            ``input_ids``, ``attention_mask`` (1 on real tokens, 0 on padding)
            and ``labels`` (answer tokens, ``IGNORE_INDEX`` elsewhere).
        """
        question, answer = self.qa_pairs[idx]

        prompt_ids = list(self.tokenizer.encode(f"Q: {question} A:"))
        # Leading space keeps the answer separated from the "A:" marker.
        answer_ids = list(self.tokenizer.encode(" " + answer.strip()))

        # Terminate the answer so the model learns where to stop generating.
        eos_id = self.tokenizer.eos_token_id
        if eos_id is not None:
            answer_ids.append(eos_id)

        # The answer carries the training signal, so it gets priority in the
        # window; an over-long prompt loses its beginning instead. This also
        # guarantees that no example ends up with every label masked.
        answer_ids = answer_ids[: self.max_length]
        prompt_room = self.max_length - len(answer_ids)
        prompt_ids = prompt_ids[-prompt_room:] if prompt_room > 0 else []

        n_real = len(prompt_ids) + len(answer_ids)
        n_pad = self.max_length - n_real

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0

        input_ids = prompt_ids + answer_ids + [pad_id] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad
        # Padding is masked by position rather than by token id: when the pad
        # token doubles as EOS, the real EOS must still be learned.
        labels = (
            [self.IGNORE_INDEX] * len(prompt_ids)
            + answer_ids
            + [self.IGNORE_INDEX] * n_pad
        )

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }
Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"]},{"data":{"text/html":["Tracking run with wandb version 0.18.5"],"text/plain":["\u003cIPython.core.display.HTML object\u003e"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Run data is saved locally in \u003ccode\u003e/content/wandb/run-20241030_082735-rzcfcgar\u003c/code\u003e"],"text/plain":["\u003cIPython.core.display.HTML object\u003e"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Syncing run \u003cstrong\u003e\u003ca href='https://wandb.ai/star-nguyenanhkiet-2302-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm/huggingface/runs/rzcfcgar' target=\"_blank\"\u003e./results\u003c/a\u003e\u003c/strong\u003e to \u003ca href='https://wandb.ai/star-nguyenanhkiet-2302-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm/huggingface' target=\"_blank\"\u003eWeights \u0026 Biases\u003c/a\u003e (\u003ca href='https://wandb.me/run' target=\"_blank\"\u003edocs\u003c/a\u003e)\u003cbr/\u003e"],"text/plain":["\u003cIPython.core.display.HTML object\u003e"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View project at \u003ca href='https://wandb.ai/star-nguyenanhkiet-2302-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm/huggingface' target=\"_blank\"\u003ehttps://wandb.ai/star-nguyenanhkiet-2302-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm/huggingface\u003c/a\u003e"],"text/plain":["\u003cIPython.core.display.HTML object\u003e"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View run at \u003ca href='https://wandb.ai/star-nguyenanhkiet-2302-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm/huggingface/runs/rzcfcgar' target=\"_blank\"\u003ehttps://wandb.ai/star-nguyenanhkiet-2302-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm/huggingface/runs/rzcfcgar\u003c/a\u003e"],"text/plain":["\u003cIPython.core.display.HTML object\u003e"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["\n","    \u003cdiv\u003e\n","      \n","      \u003cprogress value='23' max='13188' style='width:300px; 
# Fine-tune GPT-2 on the math question/answer pairs and persist the result.
from transformers import Trainer, TrainingArguments

model_name = "gpt2"
output_dir = "./results"
# Directory that receives the final fine-tuned weights and tokenizer files.
final_model_dir = f"{output_dir}/final"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token so sequences can be padded to a fixed length.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

math_dataset = MathDataset(qa_pairs, tokenizer)

# Set training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    # Checkpointing every 10 steps rewrites the full model and optimizer state
    # over a thousand times during this run; a coarser interval keeps the
    # resume capability without the I/O and disk churn.
    save_steps=500,
    save_total_limit=2,
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=math_dataset,
)

# Fine-tune the model
trainer.train()

# Periodic checkpoints stop before the last optimizer step, so store the final
# weights explicitly, together with the tokenizer, for later from_pretrained().
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
class MathChatBot:
    """Answers questions by prompting a GPT-2 causal language model.

    Args:
        model_name: Hub id or local directory accepted by ``from_pretrained``.
            The default loads the stock ``gpt2`` weights; pass the directory
            of a fine-tuned checkpoint to answer with trained weights.
    """

    def __init__(self, model_name="gpt2"):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        # Inference only: make sure dropout is disabled.
        self.model.eval()

    def get_response(self, question, max_new_tokens=50):
        """Generate an answer for ``question``.

        Args:
            question: Free-form question text.
            max_new_tokens: Upper bound on generated tokens, not counting the
                prompt, so long questions still leave room for an answer.

        Returns:
            The generated continuation, decoded without special tokens and
            stripped of surrounding whitespace. The prompt is not included.
        """
        input_text = f"Q: {question} A:"
        encoded = self.tokenizer(input_text, return_tensors="pt")
        input_ids = encoded["input_ids"]

        output = self.model.generate(
            input_ids,
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            # The tokenizer may define no pad token; EOS is used for padding.
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Slice the prompt off by length rather than splitting on "A:", which
        # would cut the reply whenever the generated text contains "A:" itself.
        generated_ids = output[0][input_ids.shape[-1]:]
        answer = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        return answer.strip()

# Usage
if __name__ == "__main__":
    bot = MathChatBot()
    while True:
        user_input = input("You: ")
        # Tolerate stray whitespace around the exit command.
        if user_input.strip().lower() == "exit":
            print("Exiting chat...")
            break
        response = bot.get_response(user_input)
        print(f"Bot: {response}")