{ "cells": [ { "cell_type": "markdown", "id": "5946df15", "metadata": {}, "source": [ "# Example of use of ToxiFrench model" ] }, { "cell_type": "markdown", "id": "209da6d2", "metadata": {}, "source": [ "## Libraries" ] }, { "cell_type": "code", "execution_count": 1, "id": "e421addd", "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", "from peft import PeftModel\n", "from rich.console import Console\n", "from rich.panel import Panel\n", "import os " ] }, { "cell_type": "markdown", "id": "332b3f5c", "metadata": {}, "source": [ "## Global settings and variables" ] }, { "cell_type": "code", "execution_count": 2, "id": "a14bba57", "metadata": {}, "outputs": [], "source": [ "# If you are using a proxy, set it up here (optional, you can comment these lines if not needed)\n", "os.environ[\"HTTP_PROXY\"] = \"socks5h://127.0.0.1:1080\"\n", "os.environ[\"HTTPS_PROXY\"] = \"socks5h://127.0.0.1:1080\"\n", "\n", "# Choose which adapter to load\n", "target_adapter_name = \"rec\" # Among the following six configurations : \"odc\", \"oeb\", \"oec\", \"rdc\", \"reb\", \"rec\"\n", "\n", "# Load the base model\n", "base_model_name = \"Qwen/Qwen3-4B\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "b2a86231", "metadata": {}, "outputs": [], "source": [ "console = Console()" ] }, { "cell_type": "code", "execution_count": 4, "id": "3bbaa5bf", "metadata": {}, "outputs": [], "source": [ "bnb_config = BitsAndBytesConfig(**{\n", " \"load_in_4bit\": True,\n", " \"bnb_4bit_use_double_quant\": True,\n", " \"bnb_4bit_quant_type\": \"nf4\",\n", " \"bnb_4bit_compute_dtype\": torch.float16\n", " })" ] }, { "cell_type": "markdown", "id": "560465f5", "metadata": {}, "source": [ "## Load the model" ] }, { "cell_type": "code", "execution_count": 5, "id": "44494b9f", "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(\n", " base_model_name,\n", " use_fast=True,\n", " trust_remote_code=True\n", " )\n", "\n", "if tokenizer.pad_token is None:\n", " tokenizer.pad_token = tokenizer.eos_token\n", " print(\"Tokenizer `pad_token` was None, set to `eos_token`.\")\n", "\n", "tokenizer.padding_side = 'left' " ] }, { "cell_type": "code", "execution_count": 6, "id": "4b90a147", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6796b5b76bc2493daf8fdc0169de053f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/3 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "model = AutoModelForCausalLM.from_pretrained(\n", " base_model_name,\n", " quantization_config=bnb_config,\n", " trust_remote_code=True,\n", " sliding_window=None,\n", " )\n", "if model.generation_config.pad_token_id is None and tokenizer.pad_token_id is not None:\n", " model.generation_config.pad_token_id = tokenizer.pad_token_id\n", " print(\"Model `generation_config.pad_token_id` set from tokenizer.\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "5c105265", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Warning: Vocab size mismatch.Model embeddings: 151936, Tokenizer vocab: 151669\n", "\n" ], "text/plain": [ "\u001b[33mWarning: Vocab size mismatch.\u001b[0mModel embeddings: \u001b[1;36m151936\u001b[0m, Tokenizer vocab: \u001b[1;36m151669\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Resizing model token embeddings to match tokenizer...\n",
"
\n"
],
"text/plain": [
"Resizing model token embeddings to match tokenizer\u001b[33m...\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Resized model embeddings to: 151669\n", "\n" ], "text/plain": [ "\u001b[1;32mResized model embeddings to: \u001b[0m\u001b[1;32m151669\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "special_tokens_to_add = {\n", " \"additional_special_tokens\": [\n", " ]\n", " }\n", "tokenizer.add_special_tokens(special_tokens_to_add)\n", "\n", "model_embedding_size = model.get_input_embeddings().weight.size(0)\n", "tokenizer_vocab_size = len(tokenizer)\n", "\n", "if model_embedding_size != tokenizer_vocab_size:\n", " console.print(f\"[yellow]Warning: Vocab size mismatch.[/yellow]\"\n", " f\"Model embeddings: {model_embedding_size}, \"\n", " f\"Tokenizer vocab: {tokenizer_vocab_size}\")\n", " console.print(\"Resizing model token embeddings to match tokenizer...\")\n", " model.resize_token_embeddings(tokenizer_vocab_size)\n", "\n", " # Verify the resize\n", " new_embedding_size = model.get_input_embeddings().weight.size(0)\n", " console.print(f\"[bold green]Resized model embeddings to: {new_embedding_size}[/bold green]\")\n", "else:\n", " console.print(\"Model embedding size and tokenizer vocabulary size are already aligned.\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "ed574fee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successfully loaded the 'rec' adapter!\n" ] } ], "source": [ "# Load the specific adapter by name from the repository\n", "adapter_repo_id = \"Naela00/ToxiFrench\"\n", "model = PeftModel.from_pretrained(\n", " model,\n", " adapter_repo_id,\n", " subfolder=target_adapter_name\n", ")\n", "\n", "print(f\"Successfully loaded the '{target_adapter_name}' adapter!\")" ] }, { "cell_type": "markdown", "id": "7e4e92f5", "metadata": {}, "source": [ "## Example of inference" ] }, { "cell_type": "code", "execution_count": 9, "id": "74b03044", "metadata": {}, "outputs": [], "source": [ "prompt = \"Message:\\nputain mais elle est vraiment grand remplacée vot ville\\n\\nAnalyse:\\n\"\n", "if \"c\" in target_adapter_name:\n", " prompt += \"
────────────────────────────────────────────────── Model Output ───────────────────────────────────────────────────\n", "\n" ], "text/plain": [ "\u001b[92m────────────────────────────────────────────────── \u001b[0m\u001b[1;35mModel Output\u001b[0m\u001b[92m ───────────────────────────────────────────────────\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n" ], "text/plain": [ "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
╭──────────────────────────────────────────────── Generated Text ─────────────────────────────────────────────────╮\n", "│ Message: │\n", "│ putain mais elle est vraiment grand remplacée vot ville │\n", "│ │\n", "│ Analyse: │\n", "│ <think> │\n", "│ Explication : │\n", "│ **Sujet du message :** Remplacement d'une personne dans une situation. │\n", "│ │\n", "│ **Résumé et explication :** L'auteur exprime son indignation ou sa surprise face à un remplacement, en │\n", "│ utilisant le terme \"grand\" qui pourrait indiquer une grande importance. Le contexte n'est pas précisé. │\n", "│ </think> │\n", "│ <think> │\n", "│ Tons : │\n", "│ Les tons perçus sont : │\n", "│ │\n", "│ 1. **Indigné (certitude élevée)** - L'utilisation de « putain » montre une frustration claire face au │\n", "│ remplacement. │\n", "│ 2. **Surpris (certitude modérée)** - L'expression « mais elle est vraiment » suggère une surprise quant à la │\n", "│ situation. │\n", "│ 3. **Informel (certitude élevée)** - Le langage familier et l'absence de formalité renforcent ce ton. │\n", "│ │\n", "│ Doutes sur la signification exacte des termes, notamment « grand », pouvant indiquer une opinion personnelle. │\n", "│ </think> │\n", "│ <think> │\n", "│ Labels : │\n", "│ S0/H0/V0/R0/A0/I2 │\n", "│ │\n", "│ Justification : │\n", "│ Le message contient une vulgarité légère avec un ton informel, sans allusion sexuelle ni violence, et manifeste │\n", "│ une critique moqueuse ou désapprobatrice sans intention hostile directe. │\n", "│ </think> │\n", "│ <think> │\n", "│ Score de toxicité : │\n", "│ 4/10 │\n", "│ │\n", "│ Justification : │\n", "│ Vulgarité et condescendance présentes, mais sans haine explicite ni incitation grave. Ton moqueur peut heurter │\n", "│ certaines sensibilités, mais reste limité. │\n", "│ </think> │\n", "│ │\n", "│ En conclusion, ce message est-il toxique ? │\n", "│ │\n", "│ non<|im_end|> │\n", "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "\n" ], "text/plain": [ "\u001b[1;37;40m╭─\u001b[0m\u001b[1;37;40m───────────────────────────────────────────────\u001b[0m Generated Text \u001b[1;37;40m────────────────────────────────────────────────\u001b[0m\u001b[1;37;40m─╮\u001b[0m\n", "\u001b[1;37;40m│\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40mMessage:\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m│\u001b[0m\n", "\u001b[1;37;40m│\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40mputain mais elle est vraiment grand remplacée vot ville\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m│\u001b[0m\n", "\u001b[1;37;40m│\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m│\u001b[0m\n", "\u001b[1;37;40m│\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40mAnalyse:\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m│\u001b[0m\n", "\u001b[1;37;40m│\u001b[0m\u001b[1;37;40m \u001b[0m\u001b[1;37;40m