{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "519048d9-fd5b-4115-9439-60995405dfb9", "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "d59a42cc-733c-4a71-8ef9-698ab3f1b163", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import re\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 3, "id": "5477a796-7bde-43c4-9e88-6016f7597463", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /Users/Pi/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], "source": [ "import nltk\n", "from nltk.corpus import stopwords\n", "nltk.download('stopwords')\n", "# Tokenization and Stopword removal\n", "stop_words = set(stopwords.words('english'))" ] }, { "cell_type": "code", "execution_count": 4, "id": "57a774b8-02cc-4be1-bfd4-a083efa1e27f", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split, learning_curve\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import classification_report, confusion_matrix\n" ] }, { "cell_type": "markdown", "id": "96e7dac5-f335-463f-9f15-94c7ea22404e", "metadata": {}, "source": [ "# 1. 
Data Cleaning and Preprocessing" ] }, { "cell_type": "markdown", "id": "79d1824d-7251-450a-a8e3-b3c84a55797f", "metadata": {}, "source": [ "# Load dataset " ] }, { "cell_type": "code", "execution_count": 5, "id": "eca0d147-453f-4223-9665-b30235699960", "metadata": {}, "outputs": [], "source": [ "# Load the dataset\n", "df = pd.read_csv(\"dataset/Kaggle_Mental_Health_Conversations_train.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "68571d93-c7a7-46b9-81f5-aea6a2ccf909", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 3512 entries, 0 to 3511\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Context 3512 non-null object\n", " 1 Response 3508 non-null object\n", "dtypes: object(2)\n", "memory usage: 55.0+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 7, "id": "2ef75f44-cf14-44b3-b4a2-7877a5c067d5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ContextResponse
count35123508
unique9952479
topI have so many issues to address. I have a his...It's normal to feel a little anxiety--after al...
freq943
\n", "
" ], "text/plain": [ " Context \\\n", "count 3512 \n", "unique 995 \n", "top I have so many issues to address. I have a his... \n", "freq 94 \n", "\n", " Response \n", "count 3508 \n", "unique 2479 \n", "top It's normal to feel a little anxiety--after al... \n", "freq 3 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 8, "id": "550ef6d6-f35c-4f34-ad60-a3956d44a1d9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Context 0\n", "Response 4\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum()" ] }, { "cell_type": "markdown", "id": "d2aaff7e-37fb-4ab7-ac12-0d71d979b3a1", "metadata": {}, "source": [ "There are 3512 rows in total. Every row has a context, but 4 rows have a missing response, leaving 3508 non-null responses. Because so few rows are affected, it is fine to drop them. Alternatively, the missing responses could be filled with a placeholder such as \"unknown\". " ] }, { "cell_type": "code", "execution_count": 9, "id": "5bc876e5-1367-4fd5-9b3f-0daad754ff64", "metadata": {}, "outputs": [], "source": [ "# Drop missing or null entries\n", "df.dropna(subset=[\"Context\", \"Response\"], inplace=True)\n", "df.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": 10, "id": "b33c1b3c-b54b-45c2-ac1c-1a543890277f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique context ratio: 28.36%\n", "Unique response ratio: 70.67%\n" ] } ], "source": [ "repeated_context = df.Context.nunique()/len(df) * 100 \n", "repeated_response = df.Response.nunique()/len(df) * 100\n", "\n", "print(f\"Unique context ratio: {repeated_context:.2f}%\")\n", "print(f\"Unique response ratio: {repeated_response:.2f}%\")" ] }, { "cell_type": "markdown", "id": "d19ea384-680c-429b-9172-71255d35f26c", "metadata": {}, "source": [ "There are repeated contexts and repeated responses."
] }, { "cell_type": "markdown", "id": "65027ce2-f590-4207-9719-b9bae2c283e5", "metadata": {}, "source": [ "# Clean text " ] }, { "cell_type": "code", "execution_count": 11, "id": "868fc2fe-6eaf-473c-9082-2e263a6acb75", "metadata": {}, "outputs": [], "source": [
"def clean_text(text):\n",
"    \"\"\"Normalize one raw message for word-level analysis.\n",
"\n",
"    Steps, in order: drop URLs, drop @mentions and '#' signs, drop every\n",
"    character that is not an ASCII letter or whitespace, then lowercase.\n",
"    Whitespace is not collapsed, so removed tokens can leave double spaces.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Raw message text.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The cleaned, lowercased text.\n",
"    \"\"\"\n",
"    text = re.sub(r\"http\\S+|www\\S+\", '', text)  # remove URLs (http\\S+ already covers https)\n",
"    # '\\w+' consumes the whole handle; without the backslash only a literal \"@w\" would match\n",
"    text = re.sub(r'@\\w+|#', '', text)  # remove @mentions and the '#' of hashtags\n",
"    text = re.sub(r'[^A-Za-z\\s]', '', text)  # remove non-alphabetic characters\n",
"    return text.lower()  # convert to lowercase\n",
"\n",
"# Apply cleaning\n",
"df['clean_context'] = df['Context'].apply(clean_text)\n",
"df['clean_response'] = df['Response'].fillna(\"\").apply(clean_text)" ] }, { "cell_type": "markdown", "id": "7caeb03d-4a47-41d9-b533-ebf116124366", "metadata": {}, "source": [ "# Check the length of text" ] }, { "cell_type": "code", "execution_count": 12, "id": "93882073-413e-41ff-8781-461aaaa2c057", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Context length stats:\n", " count 3508.000000\n", "mean 55.006271\n", "std 48.169912\n", "min 4.000000\n", "25% 28.000000\n", "50% 45.000000\n", "75% 68.000000\n", "max 526.000000\n", "Name: context_len, dtype: float64\n", "\n", "Response length stats:\n", " count 3508.000000\n", "mean 176.625428\n", "std 120.175634\n", "min 0.000000\n", "25% 93.000000\n", "50% 144.000000\n", "75% 221.000000\n", "max 939.000000\n", "Name: response_len, dtype: float64\n", "\n", "Combined message length stats:\n", " count 3508.000000\n", "mean 231.631699\n", "std 133.446614\n", "min 5.000000\n", "25% 137.000000\n", "50% 201.000000\n", "75% 287.000000\n", "max 996.000000\n", "dtype: float64\n" ] } ], "source": [
"# Tokenize on whitespace for basic word-count statistics\n",
"df['context_len'] = df['clean_context'].apply(lambda x: len(str(x).split()))\n",
"df['response_len'] = df['clean_response'].apply(lambda x: len(str(x).split()))\n",
"\n",
"# Word-count summaries for contexts, responses, and each context+response pair\n",
"context_stats = df['context_len'].describe()\n",
"response_stats = df['response_len'].describe()\n",
"combined_stats = (df['context_len'] + df['response_len']).describe()\n",
"\n",
"# Emit the three reports from one table of (heading, stats) pairs\n",
"for _heading, _stats in [\n",
"    (\"Context length stats:\\n\", context_stats),\n",
"    (\"\\nResponse length stats:\\n\", response_stats),\n",
"    (\"\\nCombined message length stats:\\n\", combined_stats),\n",
"]:\n",
"    print(_heading, _stats)" ] }, { "cell_type": "code", "execution_count": 13, "id": "ebe22297-57a7-49da-93dd-e5833a8570e2", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# One figure with two side-by-side histograms of word counts\n",
"fig, (ax_context, ax_response) = plt.subplots(1, 2, figsize=(14, 6))\n",
"\n",
"# Left panel: context length\n",
"sns.histplot(df['context_len'], bins=50, color='dodgerblue', ax=ax_context)\n",
"ax_context.set_title(\"User Message Length Distribution\")\n",
"ax_context.set_xlabel(\"Words in Context\")\n",
"ax_context.set_ylabel(\"Frequency\")\n",
"\n",
"# Right panel: response length\n",
"sns.histplot(df['response_len'], bins=50, color='seagreen', ax=ax_response)\n",
"ax_response.set_title(\"Response Length Distribution\")\n",
"ax_response.set_xlabel(\"Words in Response\")\n",
"ax_response.set_ylabel(\"Frequency\")\n",
"\n",
"# Tighten the layout before saving so the written PNG matches what is shown\n",
"fig.tight_layout()\n",
"fig.savefig(\"length_distributions.png\")\n",
"plt.show()" ] }, { "cell_type": "code", "execution_count": 14, "id": "db145d1d-86a9-43fc-99a2-cb4108114923", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Top 5 longest user inputs:\n", " clean_context context_len\n", "1014 im a teenager and throughout my entire life iv... 526\n", "1643 every once and a while i think about my exboyf... 524\n", "2308 every once and a while i think about my exboyf... 524\n", "2444 my boyfriend and i have been together for five... 478\n", "1023 my boyfriend and i have been together for five... 478\n", "\n", "Top 5 longest responses:\n", " clean_response response_len\n", "815 previous counselors have discussed very good p... 939\n", "897 hello that must be very frustrating for you to... 926\n", "3142 hello that must be very frustrating for you to... 926\n", "3185 i think there are many different directions we... 907\n", "1122 i think there are many different directions we... 
907\n", "\n", "Found 8 responses with <10 words (may be low quality).\n" ] } ], "source": [
"# Five longest contexts and responses, ranked by word count\n",
"longest_contexts = df[['clean_context', 'context_len']].sort_values(by='context_len', ascending=False).head()\n",
"longest_responses = df[['clean_response', 'response_len']].sort_values(by='response_len', ascending=False).head()\n",
"print(\"\\nTop 5 longest user inputs:\\n\", longest_contexts)\n",
"print(\"\\nTop 5 longest responses:\\n\", longest_responses)\n",
"\n",
"# Responses under 10 words are flagged as possibly low quality\n",
"is_short = df['response_len'] < 10\n",
"short_responses = df[is_short]\n",
"print(f\"\\nFound {len(short_responses)} responses with <10 words (may be low quality).\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "a64aeeef-4b1c-43f8-8a52-eb1bed75e667", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ContextResponseclean_contextclean_responsecontext_lenresponse_len
385I've gone to a couple therapy sessions so far ...Certainly.ive gone to a couple therapy sessions so far a...certainly301
1385It's the way my mom said I was worth nothing, ...There is nothing wrong with going to summer sc...its the way my mom said i was worth nothing st...there is nothing wrong with going to summer sc...219
2079I've gone to a couple therapy sessions so far ...Certainly.ive gone to a couple therapy sessions so far a...certainly301
2624such as not enough sleep0such as not enough sleep50
2695Can a counselor take sides with one parent and...<!--[if gte mso 9]>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\...can a counselor take sides with one parent and...if gte mso \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\...406
3080It's the way my mom said I was worth nothing, ...There is nothing wrong with going to summer sc...its the way my mom said i was worth nothing st...there is nothing wrong with going to summer sc...219
3293I've gone to a couple therapy sessions so far ...Certainly.ive gone to a couple therapy sessions so far a...certainly301
3507I just took a job that requires me to travel f...hmm this is a tough one!i just took a job that requires me to travel f...hmm this is a tough one506
\n", "
" ], "text/plain": [ " Context \\\n", "385 I've gone to a couple therapy sessions so far ... \n", "1385 It's the way my mom said I was worth nothing, ... \n", "2079 I've gone to a couple therapy sessions so far ... \n", "2624 such as not enough sleep \n", "2695 Can a counselor take sides with one parent and... \n", "3080 It's the way my mom said I was worth nothing, ... \n", "3293 I've gone to a couple therapy sessions so far ... \n", "3507 I just took a job that requires me to travel f... \n", "\n", " Response \\\n", "385 Certainly. \n", "1385 There is nothing wrong with going to summer sc... \n", "2079 Certainly. \n", "2624 0 \n", "2695