diff --git "a/Movie Genre Prediction Code.ipynb" "b/Movie Genre Prediction Code.ipynb"
new file mode 100644--- /dev/null
+++ "b/Movie Genre Prediction Code.ipynb"
@@ -0,0 +1,2922 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Importing Libraries"
+ ],
+ "metadata": {
+ "id": "jiNlo56ax2Us"
+ },
+ "id": "jiNlo56ax2Us"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e0c77096",
+ "metadata": {
+ "id": "e0c77096"
+ },
+ "outputs": [],
+ "source": [
+ "# Importing required Libraries\n",
+ "\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.naive_bayes import MultinomialNB\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "b5fbe912",
+ "metadata": {
+ "scrolled": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "b5fbe912",
+ "outputId": "a8b49040-2f4b-429b-d174-aead00979ed5"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n",
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Text-processing imports and the NLTK resources they need at runtime\n",
+ "\n",
+ "import re\n",
+ "\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "\n",
+ "# 'stopwords' backs stopwords.words('english'); 'punkt' backs word_tokenize.\n",
+ "# Newer NLTK releases load the tokenizer models from 'punkt_tab' instead of 'punkt',\n",
+ "# so both are fetched to keep the notebook runnable across NLTK versions.\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('punkt')\n",
+ "nltk.download('punkt_tab')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "fcb105bd",
+ "metadata": {
+ "id": "fcb105bd"
+ },
+ "outputs": [],
+ "source": [
+ "# Creating set of English stop words\n",
+ "# (kept as a set because preprocessSynopsis tests every token for membership in it)\n",
+ "\n",
+ "stop_words = set(stopwords.words('english'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Importing and Pre-processing Training Dataset"
+ ],
+ "metadata": {
+ "id": "8mICKWhquvro"
+ },
+ "id": "8mICKWhquvro"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "f5965adb",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "f5965adb",
+ "outputId": "42ca0d77-46cb-4032-9071-37bad3d4ce61"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 44978 Super Me \n",
+ "1 50185 Entity Project \n",
+ "2 34131 Behavioral Family Therapy for Serious Psychiat... \n",
+ "3 78522 Blood Glacier \n",
+ "4 2206 Apat na anino \n",
+ "\n",
+ " synopsis genre \n",
+ "0 A young scriptwriter starts bringing valuable ... fantasy \n",
+ "1 A director and her friends renting a haunted h... horror \n",
+ "2 This is an educational video for families and ... family \n",
+ "3 Scientists working in the Austrian Alps discov... scifi \n",
+ "4 Buy Day - Four Men Widely - Apart in Life - By... action "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 44978 | \n",
+ " Super Me | \n",
+ " A young scriptwriter starts bringing valuable ... | \n",
+ " fantasy | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 50185 | \n",
+ " Entity Project | \n",
+ " A director and her friends renting a haunted h... | \n",
+ " horror | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 34131 | \n",
+ " Behavioral Family Therapy for Serious Psychiat... | \n",
+ " This is an educational video for families and ... | \n",
+ " family | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 78522 | \n",
+ " Blood Glacier | \n",
+ " Scientists working in the Austrian Alps discov... | \n",
+ " scifi | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2206 | \n",
+ " Apat na anino | \n",
+ " Buy Day - Four Men Widely - Apart in Life - By... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ],
+ "source": [
+ "# Importing Training Dataset (expects train.csv in the notebook's working directory)\n",
+ "\n",
+ "train_data = pd.read_csv(\"train.csv\")\n",
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "20d4b346",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "20d4b346",
+ "outputId": "7e787488-2023-4854-e1f3-e5e1d5e237c2"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 54000 entries, 0 to 53999\n",
+ "Data columns (total 4 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 54000 non-null int64 \n",
+ " 1 movie_name 54000 non-null object\n",
+ " 2 synopsis 54000 non-null object\n",
+ " 3 genre 54000 non-null object\n",
+ "dtypes: int64(1), object(3)\n",
+ "memory usage: 1.6+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Getting Info about Train Data: column names, non-null counts and dtypes\n",
+ "\n",
+ "train_data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "fc1a181f",
+ "metadata": {
+ "scrolled": false,
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "fc1a181f",
+ "outputId": "594fa4a9-9456-489c-eea1-4e5b160f850f"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "movie_name 0\n",
+ "synopsis 0\n",
+ "genre 0\n",
+ "dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "# Checking for Null Values: number of missing entries per column\n",
+ "\n",
+ "train_data.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "ccbc40da",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ccbc40da",
+ "outputId": "27766f2e-df95-49b1-c6bf-dbf281b3bf21"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "fantasy 5400\n",
+ "horror 5400\n",
+ "family 5400\n",
+ "scifi 5400\n",
+ "action 5400\n",
+ "crime 5400\n",
+ "adventure 5400\n",
+ "mystery 5400\n",
+ "romance 5400\n",
+ "thriller 5400\n",
+ "Name: genre, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "# Getting Number of Classes and their Distribution in Train Data\n",
+ "# (one row per distinct genre with its number of occurrences)\n",
+ "\n",
+ "train_data['genre'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "b630de43",
+ "metadata": {
+ "id": "b630de43"
+ },
+ "outputs": [],
+ "source": [
+ "# Method to pre-process text from column: movie_name\n",
+ "\n",
+ "def preprocessMovieName(movieNames):\n",
+ "    \"\"\"\n",
+ "    Normalize movie titles for use as text features.\n",
+ "\n",
+ "    Each title is lower-cased and stripped of leading/trailing spaces.\n",
+ "    Values that are not strings (e.g. None, or the float NaN pandas uses\n",
+ "    for empty CSV cells) are mapped to an empty string instead of raising.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    movieNames : iterable\n",
+ "        Raw movie titles, e.g. a list or a pandas Series.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    list of str\n",
+ "        Cleaned titles, in the same order as the input.\n",
+ "    \"\"\"\n",
+ "    return [\n",
+ "        movie.lower().strip(' ') if isinstance(movie, str) else ''\n",
+ "        for movie in movieNames\n",
+ "    ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "6bc44949",
+ "metadata": {
+ "scrolled": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "6bc44949",
+ "outputId": "8765cf23-b184-4c66-fe6b-23709fcaa5c8"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 44978 super me \n",
+ "1 50185 entity project \n",
+ "2 34131 behavioral family therapy for serious psychiat... \n",
+ "3 78522 blood glacier \n",
+ "4 2206 apat na anino \n",
+ "\n",
+ " synopsis genre \n",
+ "0 A young scriptwriter starts bringing valuable ... fantasy \n",
+ "1 A director and her friends renting a haunted h... horror \n",
+ "2 This is an educational video for families and ... family \n",
+ "3 Scientists working in the Austrian Alps discov... scifi \n",
+ "4 Buy Day - Four Men Widely - Apart in Life - By... action "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 44978 | \n",
+ " super me | \n",
+ " A young scriptwriter starts bringing valuable ... | \n",
+ " fantasy | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 50185 | \n",
+ " entity project | \n",
+ " A director and her friends renting a haunted h... | \n",
+ " horror | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 34131 | \n",
+ " behavioral family therapy for serious psychiat... | \n",
+ " This is an educational video for families and ... | \n",
+ " family | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 78522 | \n",
+ " blood glacier | \n",
+ " Scientists working in the Austrian Alps discov... | \n",
+ " scifi | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2206 | \n",
+ " apat na anino | \n",
+ " Buy Day - Four Men Widely - Apart in Life - By... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ],
+ "source": [
+ "# Transforming movie_name column using preprocessMovieName method\n",
+ "# (the column is overwritten in place with the cleaned titles)\n",
+ "\n",
+ "movieNames = train_data['movie_name']\n",
+ "train_data['movie_name'] = preprocessMovieName(movieNames)\n",
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "bebcffc9",
+ "metadata": {
+ "id": "bebcffc9"
+ },
+ "outputs": [],
+ "source": [
+ "# Method to pre-process text from column: synopsis\n",
+ "\n",
+ "def preprocessSynopsis(synopsis):\n",
+ "    \"\"\"\n",
+ "    Clean synopsis texts for vectorization.\n",
+ "\n",
+ "    Each synopsis is lower-cased, every character that is not an ASCII letter\n",
+ "    is replaced by a space (this removes digits, punctuation and symbols), the\n",
+ "    text is tokenized with NLTK's word_tokenize, tokens found in the\n",
+ "    module-level stop_words set are dropped, and the remaining tokens are\n",
+ "    re-joined with single spaces. Values that are not strings (e.g. None or\n",
+ "    NaN from an empty CSV cell) yield an empty string instead of raising.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    synopsis : iterable\n",
+ "        Raw synopsis texts, e.g. a list or a pandas Series.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    list of str\n",
+ "        Cleaned synopses, in the same order as the input.\n",
+ "    \"\"\"\n",
+ "    # Compiled once per call instead of passing the pattern string for every row.\n",
+ "    nonLetters = re.compile(r'[^a-zA-Z]')\n",
+ "    cleanedSynopses = []\n",
+ "\n",
+ "    for synop in synopsis:\n",
+ "        if not isinstance(synop, str):\n",
+ "            # Missing value: emit an empty text so the output stays aligned with the input rows.\n",
+ "            cleanedSynopses.append('')\n",
+ "            continue\n",
+ "\n",
+ "        text = nonLetters.sub(' ', synop.lower()).strip(' ')\n",
+ "        keptTokens = [w for w in word_tokenize(text) if w not in stop_words]\n",
+ "        cleanedSynopses.append(' '.join(keptTokens))\n",
+ "\n",
+ "    return cleanedSynopses"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "5994c4ee",
+ "metadata": {
+ "scrolled": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "5994c4ee",
+ "outputId": "84e4f903-491b-4313-c7b0-287b0748e2bb"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 44978 super me \n",
+ "1 50185 entity project \n",
+ "2 34131 behavioral family therapy for serious psychiat... \n",
+ "3 78522 blood glacier \n",
+ "4 2206 apat na anino \n",
+ "\n",
+ " synopsis genre \n",
+ "0 young scriptwriter starts bringing valuable ob... fantasy \n",
+ "1 director friends renting haunted house capture... horror \n",
+ "2 educational video families family therapists d... family \n",
+ "3 scientists working austrian alps discover glac... scifi \n",
+ "4 buy day four men widely apart life night shado... action "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 44978 | \n",
+ " super me | \n",
+ " young scriptwriter starts bringing valuable ob... | \n",
+ " fantasy | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 50185 | \n",
+ " entity project | \n",
+ " director friends renting haunted house capture... | \n",
+ " horror | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 34131 | \n",
+ " behavioral family therapy for serious psychiat... | \n",
+ " educational video families family therapists d... | \n",
+ " family | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 78522 | \n",
+ " blood glacier | \n",
+ " scientists working austrian alps discover glac... | \n",
+ " scifi | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2206 | \n",
+ " apat na anino | \n",
+ " buy day four men widely apart life night shado... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ],
+ "source": [
+ "# Transforming synopsis column using preprocessSynopsis method\n",
+ "# (the column is overwritten in place with the cleaned text)\n",
+ "\n",
+ "synopsis = train_data['synopsis']\n",
+ "train_data['synopsis'] = preprocessSynopsis(synopsis)\n",
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Method to combine text values from movie_name and synopsis columns\n",
+ "\n",
+ "def mergeText(df):\n",
+ "    \"\"\"\n",
+ "    Combine the movie_name and synopsis columns into one text per row.\n",
+ "\n",
+ "    Each resulting value has the form: movie_name + ' ' + synopsis, with both\n",
+ "    parts converted through str(). Rows are paired by position, so the result\n",
+ "    does not depend on the index (non-default or repeated labels are fine).\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df : pandas.DataFrame\n",
+ "        Frame containing 'movie_name' and 'synopsis' columns.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    list of str\n",
+ "        One combined text per row of df, in row order.\n",
+ "    \"\"\"\n",
+ "    # Iterating the two columns in lockstep avoids a label lookup per row\n",
+ "    # (df[col][ind]), which is slow and returns a Series for duplicated labels.\n",
+ "    return [\n",
+ "        str(movieName) + ' ' + str(synopsisText)\n",
+ "        for movieName, synopsisText in zip(df['movie_name'], df['synopsis'])\n",
+ "    ]"
+ ],
+ "metadata": {
+ "id": "SuSa8M9yvemY"
+ },
+ "id": "SuSa8M9yvemY",
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Applying mergeText method and storing values in new column: movie_synopsis\n",
+ "# (this combined text is the input the TF-IDF vectorizer is fitted on later)\n",
+ "\n",
+ "train_data['movie_synopsis'] = mergeText(train_data)\n",
+ "train_data.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "6Ag-7TllvLm4",
+ "outputId": "2d851bd2-c64a-4a4a-f97b-a98f24df8ac9"
+ },
+ "id": "6Ag-7TllvLm4",
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 44978 super me \n",
+ "1 50185 entity project \n",
+ "2 34131 behavioral family therapy for serious psychiat... \n",
+ "3 78522 blood glacier \n",
+ "4 2206 apat na anino \n",
+ "\n",
+ " synopsis genre \\\n",
+ "0 young scriptwriter starts bringing valuable ob... fantasy \n",
+ "1 director friends renting haunted house capture... horror \n",
+ "2 educational video families family therapists d... family \n",
+ "3 scientists working austrian alps discover glac... scifi \n",
+ "4 buy day four men widely apart life night shado... action \n",
+ "\n",
+ " movie_synopsis \n",
+ "0 super me young scriptwriter starts bringing va... \n",
+ "1 entity project director friends renting haunte... \n",
+ "2 behavioral family therapy for serious psychiat... \n",
+ "3 blood glacier scientists working austrian alps... \n",
+ "4 apat na anino buy day four men widely apart li... "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " genre | \n",
+ " movie_synopsis | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 44978 | \n",
+ " super me | \n",
+ " young scriptwriter starts bringing valuable ob... | \n",
+ " fantasy | \n",
+ " super me young scriptwriter starts bringing va... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 50185 | \n",
+ " entity project | \n",
+ " director friends renting haunted house capture... | \n",
+ " horror | \n",
+ " entity project director friends renting haunte... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 34131 | \n",
+ " behavioral family therapy for serious psychiat... | \n",
+ " educational video families family therapists d... | \n",
+ " family | \n",
+ " behavioral family therapy for serious psychiat... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 78522 | \n",
+ " blood glacier | \n",
+ " scientists working austrian alps discover glac... | \n",
+ " scifi | \n",
+ " blood glacier scientists working austrian alps... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2206 | \n",
+ " apat na anino | \n",
+ " buy day four men widely apart life night shado... | \n",
+ " action | \n",
+ " apat na anino buy day four men widely apart li... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Label Encoding Target Classes"
+ ],
+ "metadata": {
+ "id": "YpVgEjifxrCB"
+ },
+ "id": "YpVgEjifxrCB"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Using Label Encoder to encode classes from genre\n",
+ "# NOTE: the genre column is overwritten with integer codes, so this cell should run only once per kernel session;\n",
+ "# re-running it would re-fit the encoder on the codes rather than on the genre names.\n",
+ "\n",
+ "le_genre = LabelEncoder()\n",
+ "train_data['genre'] = le_genre.fit_transform(train_data['genre'])\n",
+ "train_data.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "iNAZim2poDBz",
+ "outputId": "65e6a4fb-8570-42c1-83cd-ac713e003657"
+ },
+ "id": "iNAZim2poDBz",
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 44978 super me \n",
+ "1 50185 entity project \n",
+ "2 34131 behavioral family therapy for serious psychiat... \n",
+ "3 78522 blood glacier \n",
+ "4 2206 apat na anino \n",
+ "\n",
+ " synopsis genre \\\n",
+ "0 young scriptwriter starts bringing valuable ob... 4 \n",
+ "1 director friends renting haunted house capture... 5 \n",
+ "2 educational video families family therapists d... 3 \n",
+ "3 scientists working austrian alps discover glac... 8 \n",
+ "4 buy day four men widely apart life night shado... 0 \n",
+ "\n",
+ " movie_synopsis \n",
+ "0 super me young scriptwriter starts bringing va... \n",
+ "1 entity project director friends renting haunte... \n",
+ "2 behavioral family therapy for serious psychiat... \n",
+ "3 blood glacier scientists working austrian alps... \n",
+ "4 apat na anino buy day four men widely apart li... "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " genre | \n",
+ " movie_synopsis | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 44978 | \n",
+ " super me | \n",
+ " young scriptwriter starts bringing valuable ob... | \n",
+ " 4 | \n",
+ " super me young scriptwriter starts bringing va... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 50185 | \n",
+ " entity project | \n",
+ " director friends renting haunted house capture... | \n",
+ " 5 | \n",
+ " entity project director friends renting haunte... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 34131 | \n",
+ " behavioral family therapy for serious psychiat... | \n",
+ " educational video families family therapists d... | \n",
+ " 3 | \n",
+ " behavioral family therapy for serious psychiat... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 78522 | \n",
+ " blood glacier | \n",
+ " scientists working austrian alps discover glac... | \n",
+ " 8 | \n",
+ " blood glacier scientists working austrian alps... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2206 | \n",
+ " apat na anino | \n",
+ " buy day four men widely apart life night shado... | \n",
+ " 0 | \n",
+ " apat na anino buy day four men widely apart li... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Retrieving list of classes from Label Encoder\n",
+ "# (the position of a genre in this array is the integer code shown in the encoded genre column)\n",
+ "\n",
+ "le_genre.classes_"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "jU8Xu7PmoFc1",
+ "outputId": "3e739ff8-6733-4985-c15e-bfb081c32802"
+ },
+ "id": "jU8Xu7PmoFc1",
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['action', 'adventure', 'crime', 'family', 'fantasy', 'horror',\n",
+ " 'mystery', 'romance', 'scifi', 'thriller'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Vectorizing Textual Data"
+ ],
+ "metadata": {
+ "id": "9ha441zmxtih"
+ },
+ "id": "9ha441zmxtih"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Vectorizing textual data i.e. converting each text into a sparse row of TF-IDF weights using TF-IDF Vectorizer\n",
+ "# NOTE(review): the vectorizer is fitted on all rows before the train/validation split below,\n",
+ "# so its vocabulary and IDF weights also reflect the validation rows - consider fitting on the training split only.\n",
+ "\n",
+ "cv = TfidfVectorizer()\n",
+ "vectorized_synopsis = cv.fit_transform(train_data['movie_synopsis'])\n",
+ "vectorized_synopsis[0]"
+ ],
+ "metadata": {
+ "id": "pcC4sDFUbuDv",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "d37267d4-441d-4bed-8aa3-1973e200b5d2"
+ },
+ "id": "pcC4sDFUbuDv",
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "<1x60085 sparse matrix of type ''\n",
+ "\twith 16 stored elements in Compressed Sparse Row format>"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Separating X: Features and Y: Target columns\n",
+ "# X is the TF-IDF matrix of the combined text; Y holds the label-encoded genre of each row\n",
+ "\n",
+ "X = vectorized_synopsis\n",
+ "Y = train_data['genre'].values\n",
+ "\n",
+ "# Both shapes should report the same number of rows\n",
+ "print(\"Features Shape: \",X.shape)\n",
+ "print(\"Target Shape: \",Y.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5n0JM8UHmq5s",
+ "outputId": "77361117-be79-4e7a-928d-ed78c4cda003"
+ },
+ "id": "5n0JM8UHmq5s",
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Features Shape: (54000, 60085)\n",
+ "Target Shape: (54000,)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Splitting data into Train and Validation Sets"
+ ],
+ "metadata": {
+ "id": "gs1QN3aUx-Jj"
+ },
+ "id": "gs1QN3aUx-Jj"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "7d5005b3",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7d5005b3",
+ "outputId": "ede369e4-af55-467f-8525-20a1ed145615"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "<1x60085 sparse matrix of type ''\n",
+ "\twith 25 stored elements in Compressed Sparse Row format>"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ],
+ "source": [
+ "# Splitting into Training and Validation Sets with 25% validation split\n",
+ "# random_state makes the split (and every validation score below) reproducible across runs;\n",
+ "# stratify keeps the genre proportions the same in both sets.\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, Y, test_size=0.25, random_state=42, stratify=Y\n",
+ ")\n",
+ "X_train[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Model Building: Training, Prediction and Metric Evaluation"
+ ],
+ "metadata": {
+ "id": "XXWXGR_iyH-G"
+ },
+ "id": "XXWXGR_iyH-G"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Training model using Multinomial Naive Bayes, Getting predictions on Validation set, Calculating Metric: Accuracy\n",
+ "# (MultinomialNB is already imported in the first cell, so it is not imported again here)\n",
+ "\n",
+ "mnb = MultinomialNB()\n",
+ "\n",
+ "mnb.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = mnb.predict(X_test)\n",
+ "\n",
+ "print(\"Val Acc using MultinomialNB: \", accuracy_score(y_test, y_pred))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bJXTw8qxOEIV",
+ "outputId": "0ee8688b-60a0-42a8-d657-0327475f439c"
+ },
+ "id": "bJXTw8qxOEIV",
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Val Acc using MultinomialNB: 0.3622222222222222\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Training model using Decision Tree Classifier, Getting predictions on Validation set, Calculating Metric: Accuracy\n",
+ "\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "\n",
+ "# random_state fixes the tree's internal randomness so the reported accuracy is reproducible\n",
+ "dt_clf = DecisionTreeClassifier(random_state=42)\n",
+ "\n",
+ "dt_clf.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = dt_clf.predict(X_test)\n",
+ "\n",
+ "print(\"Val Acc using Decision Tree: \", accuracy_score(y_test, y_pred))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1ytk2S35RooF",
+ "outputId": "6b9f3f83-c832-4f10-edc5-a56ee6f62d7c"
+ },
+ "id": "1ytk2S35RooF",
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Val Acc using Decision Tree: 0.18748148148148147\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fitting a 7-nearest-neighbours classifier on the training split, then scoring it on the validation split (metric: accuracy)\n",
+ "\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "\n",
+ "# fit() returns the fitted estimator, so construction and training are chained\n",
+ "knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)\n",
+ "y_pred = knn.predict(X_test)\n",
+ "\n",
+ "print(\"Val Acc using KNN: \", accuracy_score(y_test, y_pred))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6MOxLHJVe9um",
+ "outputId": "c3b8154c-2923-4a34-d474-0d8796d3c957"
+ },
+ "id": "6MOxLHJVe9um",
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Val Acc using KNN: 0.23837037037037037\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "As our principal metric is Accuracy, we select Multinomial Naive Bayes as our final model.
\n",
+ "Multinomial Naive Bayes outperforms all the other models considered, so it is used for Test Data Prediction."
+ ],
+ "metadata": {
+ "id": "-5VNI6OVyVXW"
+ },
+ "id": "-5VNI6OVyVXW"
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Test Data Prediction"
+ ],
+ "metadata": {
+ "id": "6hZGPKFgT5X_"
+ },
+ "id": "6hZGPKFgT5X_"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Importing Test Dataset (expects test.csv in the notebook's working directory)\n",
+ "\n",
+ "test_data = pd.read_csv(\"test.csv\")\n",
+ "test_data.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "kKZySdPbT04P",
+ "outputId": "b525b13a-4f69-4eba-faaf-f1390788bfc8"
+ },
+ "id": "kKZySdPbT04P",
+ "execution_count": 22,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 16863 A Death Sentence \n",
+ "1 48456 Intermedio \n",
+ "2 41383 30 Chua Phai Tet \n",
+ "3 84007 Paranoiac \n",
+ "4 40269 Ordinary Happiness \n",
+ "\n",
+ " synopsis genre \n",
+ "0 12 y.o. Ida's dad'll die without a DKK1,500,00... action \n",
+ "1 A group of four teenage friends become trapped... action \n",
+ "2 A guy left his home for 12 years till he came ... action \n",
+ "3 A man long believed dead returns to the family... action \n",
+ "4 After a deadly accident, Paolo comes back on E... action "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 16863 | \n",
+ " A Death Sentence | \n",
+ " 12 y.o. Ida's dad'll die without a DKK1,500,00... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 48456 | \n",
+ " Intermedio | \n",
+ " A group of four teenage friends become trapped... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41383 | \n",
+ " 30 Chua Phai Tet | \n",
+ " A guy left his home for 12 years till he came ... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 84007 | \n",
+ " Paranoiac | \n",
+ " A man long believed dead returns to the family... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 40269 | \n",
+ " Ordinary Happiness | \n",
+ " After a deadly accident, Paolo comes back on E... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 22
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Applying to the test data the same cleaning functions used on the training data\n",
+ "\n",
+ "movieNames = test_data['movie_name']\n",
+ "test_data['movie_name'] = preprocessMovieName(movieNames)\n",
+ "\n",
+ "synopsis = test_data['synopsis']\n",
+ "test_data['synopsis'] = preprocessSynopsis(synopsis)\n",
+ "\n",
+ "test_data.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "0_bFugnGUsux",
+ "outputId": "3014bb16-d73b-4bc8-9216-e7b812966f50"
+ },
+ "id": "0_bFugnGUsux",
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 16863 a death sentence \n",
+ "1 48456 intermedio \n",
+ "2 41383 30 chua phai tet \n",
+ "3 84007 paranoiac \n",
+ "4 40269 ordinary happiness \n",
+ "\n",
+ " synopsis genre \n",
+ "0 ida dad die without dkk operation ida plans st... action \n",
+ "1 group four teenage friends become trapped mexi... action \n",
+ "2 guy left home years till came back claim fathe... action \n",
+ "3 man long believed dead returns family estate c... action \n",
+ "4 deadly accident paolo comes back earth minutes... action "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 16863 | \n",
+ " a death sentence | \n",
+ " ida dad die without dkk operation ida plans st... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 48456 | \n",
+ " intermedio | \n",
+ " group four teenage friends become trapped mexi... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41383 | \n",
+ " 30 chua phai tet | \n",
+ " guy left home years till came back claim fathe... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 84007 | \n",
+ " paranoiac | \n",
+ " man long believed dead returns family estate c... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 40269 | \n",
+ " ordinary happiness | \n",
+ " deadly accident paolo comes back earth minutes... | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Building the combined text feature, then dropping the genre column from the test data.\n",
+ "# NOTE(review): genre in test.csv looks like a placeholder (the preview rows are all 'action') - confirm.\n",
+ "\n",
+ "test_data['movie_synopsis'] = mergeText(test_data)\n",
+ "\n",
+ "# Re-assigning instead of inplace=True, with errors='ignore', lets this cell be re-run\n",
+ "# after genre has already been dropped without raising a KeyError.\n",
+ "test_data = test_data.drop(columns=['genre'], errors='ignore')\n",
+ "\n",
+ "test_data.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "Jq_alTF3USTX",
+ "outputId": "30568322-7d54-4a0e-9934-289f892f0b4f"
+ },
+ "id": "Jq_alTF3USTX",
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id movie_name \\\n",
+ "0 16863 a death sentence \n",
+ "1 48456 intermedio \n",
+ "2 41383 30 chua phai tet \n",
+ "3 84007 paranoiac \n",
+ "4 40269 ordinary happiness \n",
+ "\n",
+ " synopsis \\\n",
+ "0 ida dad die without dkk operation ida plans st... \n",
+ "1 group four teenage friends become trapped mexi... \n",
+ "2 guy left home years till came back claim fathe... \n",
+ "3 man long believed dead returns family estate c... \n",
+ "4 deadly accident paolo comes back earth minutes... \n",
+ "\n",
+ " movie_synopsis \n",
+ "0 a death sentence ida dad die without dkk opera... \n",
+ "1 intermedio group four teenage friends become t... \n",
+ "2 30 chua phai tet guy left home years till came... \n",
+ "3 paranoiac man long believed dead returns famil... \n",
+ "4 ordinary happiness deadly accident paolo comes... "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " movie_name | \n",
+ " synopsis | \n",
+ " movie_synopsis | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 16863 | \n",
+ " a death sentence | \n",
+ " ida dad die without dkk operation ida plans st... | \n",
+ " a death sentence ida dad die without dkk opera... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 48456 | \n",
+ " intermedio | \n",
+ " group four teenage friends become trapped mexi... | \n",
+ " intermedio group four teenage friends become t... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41383 | \n",
+ " 30 chua phai tet | \n",
+ " guy left home years till came back claim fathe... | \n",
+ " 30 chua phai tet guy left home years till came... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 84007 | \n",
+ " paranoiac | \n",
+ " man long believed dead returns family estate c... | \n",
+ " paranoiac man long believed dead returns famil... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 40269 | \n",
+ " ordinary happiness | \n",
+ " deadly accident paolo comes back earth minutes... | \n",
+ " ordinary happiness deadly accident paolo comes... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 24
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Vectorizing the test text with the vectorizer fitted on the training data (transform only, no re-fit).\n",
+ "# A separate name is used so the training matrix in vectorized_synopsis is not overwritten;\n",
+ "# otherwise re-running the earlier \"X = vectorized_synopsis\" cell would pick up the test matrix.\n",
+ "test_vectorized_synopsis = cv.transform(test_data['movie_synopsis'])\n",
+ "\n",
+ "# Predicting encoded genres with the selected model (Multinomial Naive Bayes)\n",
+ "predictions = mnb.predict(test_vectorized_synopsis)\n",
+ "\n",
+ "# Converting the integer codes back to the original genre names\n",
+ "genre_predictions = le_genre.inverse_transform(predictions)"
+ ],
+ "metadata": {
+ "id": "8yeCDG6kUifi"
+ },
+ "id": "8yeCDG6kUifi",
+ "execution_count": 25,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Building the submission table: one row per test movie with its id and predicted genre\n",
+ "# (a single DataFrame constructor is enough; the extra pd.DataFrame() wrapper around it was redundant)\n",
+ "\n",
+ "submission = pd.DataFrame({'id': test_data['id'], 'genre': genre_predictions})\n",
+ "submission.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "Azmt1BZsVVgC",
+ "outputId": "a731ee69-6365-4537-e7ea-541a4962ab3e"
+ },
+ "id": "Azmt1BZsVVgC",
+ "execution_count": 26,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id genre\n",
+ "0 16863 crime\n",
+ "1 48456 horror\n",
+ "2 41383 scifi\n",
+ "3 84007 mystery\n",
+ "4 40269 fantasy"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 16863 | \n",
+ " crime | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 48456 | \n",
+ " horror | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41383 | \n",
+ " scifi | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 84007 | \n",
+ " mystery | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 40269 | \n",
+ " fantasy | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 26
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Writing the predictions to CSV; index=False keeps the row index out of the file\n",
+ "submission.to_csv('submission_ShalakaThorat.csv', index=False)"
+ ],
+ "metadata": {
+ "id": "u1kZLHbBWO0d"
+ },
+ "id": "u1kZLHbBWO0d",
+ "execution_count": 27,
+ "outputs": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ },
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file