{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Add Additional Datasets to The Model." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "# from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "import tensorflow as tf\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
commenttoxicsevere_toxicobscene threatinsultidentity_hate
0Huyu msee ni malaya1001.00
1Zakayo ni mbwa0001.00
2Chira ni shoga1101.00
3Akinirushia mamdomo mi ntamuua1110.00
4Haga sana0001.00
5Kuma ya mamako1101.00
6Wewe ni mjinga sana!1001.00
7Nataka nikuchape vibaya!0100.00
8Huo ni upuuzi mtupu!1000.00
9Wewe ni takataka!0000.01
\n", "
" ], "text/plain": [ " comment toxic severe_toxic obscene threat \\\n", "0 Huyu msee ni malaya 1 0 0 \n", "1 Zakayo ni mbwa 0 0 0 \n", "2 Chira ni shoga 1 1 0 \n", "3 Akinirushia mamdomo mi ntamuua 1 1 1 \n", "4 Haga sana 0 0 0 \n", "5 Kuma ya mamako 1 1 0 \n", "6 Wewe ni mjinga sana! 1 0 0 \n", "7 Nataka nikuchape vibaya! 0 1 0 \n", "8 Huo ni upuuzi mtupu! 1 0 0 \n", "9 Wewe ni takataka! 0 0 0 \n", "\n", " insult identity_hate \n", "0 1.0 0 \n", "1 1.0 0 \n", "2 1.0 0 \n", "3 0.0 0 \n", "4 1.0 0 \n", "5 1.0 0 \n", "6 1.0 0 \n", "7 0.0 0 \n", "8 0.0 0 \n", "9 0.0 1 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
"dataframe = pd.read_csv('swahili.csv')\n",
"\n",
"texts = dataframe['comment'].values\n",
"# NOTE: 'obscene threat' is a single, space-containing column name in\n",
"# swahili.csv (see the 5-column selection matching the output above),\n",
"# not two separate 'obscene' and 'threat' columns.\n",
"labels = dataframe[['toxic', 'severe_toxic', 'obscene threat', 'insult', 'identity_hate']].values\n",
"\n",
"dataframe.head(10)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Preprocess and prepare data for training.\n",
"# This import is required here: the copy in the top imports cell is commented\n",
"# out, so without it `pad_sequences` raises NameError on Restart & Run All.\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"\n",
"max_len = 200  # maximum token-sequence length used for padding/truncation\n",
"\n",
"# Load the fitted tokenizer.\n",
"# NOTE(review): pickle.load executes arbitrary code — only load trusted files.\n",
"with open('tokenizer.pickle', 'rb') as handle:\n",
"    tokenizer = pickle.load(handle)\n",
"\n",
"# Tokenize the comments, then pad every sequence to max_len\n",
"sequences = tokenizer.texts_to_sequences(texts)\n",
"padded_sequences = pad_sequences(sequences, maxlen=max_len)\n",
"\n",
"# Data splitting (fixed seed so the split is reproducible)\n",
"X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)\n",
"\n",
"# Create batched TensorFlow datasets for training and validation\n",
"train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)\n",
"val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(32)\n"
] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 
"3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }