{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from os import listdir\n",
    "from os.path import isfile, join\n",
    "\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set path and load data\n",
    "# Dataset root: set the HOPE_DATA_DIR environment variable to run elsewhere;\n",
    "# the default is the location this notebook was originally written against.\n",
    "DATA_DIR = os.environ.get('HOPE_DATA_DIR', '/Users/lsacy/data/HOPE_WSDM_2022')\n",
    "\n",
    "# The trailing '' makes join() end each path with a separator.\n",
    "path = join(DATA_DIR, 'Train', '')\n",
    "path2 = join(DATA_DIR, 'Test', '')\n",
    "path3 = join(DATA_DIR, 'Validation', '')\n",
    "\n",
    "\n",
    "def list_files(directory):\n",
    "    \"\"\"Return the names of the regular files in `directory`, sorted by name.\n",
    "\n",
    "    os.listdir() returns entries in arbitrary order, so the names are sorted\n",
    "    to give every file the same position in df_list on every run.\n",
    "    \"\"\"\n",
    "    return sorted(f for f in listdir(directory) if isfile(join(directory, f)))\n",
    "\n",
    "\n",
    "files = list_files(path)\n",
    "files2 = list_files(path2)\n",
    "files3 = list_files(path3)\n",
    "\n",
    "# One DataFrame per file: Train files first, then Test, then Validation.\n",
    "df_list = [\n",
    "    pd.read_csv(join(directory, name))\n",
    "    for directory, names in ((path, files), (path2, files2), (path3, files3))\n",
    "    for name in names\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "# do basic cleaning (removing NaNs, etc.)\n",
    "\n",
    "# remove the 'Unnamed: 0*' columns (presumably saved index columns - confirm)\n",
    "INDEX_ARTIFACT_COLUMNS = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1']\n",
    "for df in df_list:\n",
    "    # errors='ignore' skips the columns a file does not have, without the\n",
    "    # bare except that used to hide every other kind of failure as well.\n",
    "    df.drop(columns=INDEX_ARTIFACT_COLUMNS, errors='ignore', inplace=True)\n",
    "\n",
    "# find 'Type' == NaN: print the position of each affected frame in df_list\n",
    "# followed by the rows whose 'Type' is missing\n",
    "for i, df in enumerate(df_list):\n",
    "    missing_type = df['Type'].isnull()\n",
    "    if missing_type.any():\n",
    "        print(i)\n",
    "        print(df[missing_type])\n",
    "\n",
    "# Set the type of the row with NaN value to 'T'\n",
    "for i in range(len(df_list)):\n",
    "    if df_list[i]['Type'].isnull().any():\n",
    "        
df_list[i].loc[df_list[i]['Type'].isnull(), 'Type'] = 'T'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To reach the prompt - completion format the speakers have to alternate.\n",
    "# Print the position in df_list of every conversation in which some row has\n",
    "# the same 'Type' as the row before it (row compared with shift(1)).\n",
    "for idx, conversation in enumerate(df_list):\n",
    "    speaker = conversation['Type']\n",
    "    if speaker.eq(speaker.shift(1)).any():\n",
    "        print(idx)\n",
    "\n",
    "# Add a 'key' column to every frame: a counter that goes up by one each time\n",
    "# 'Type' differs from the previous row, so an unbroken run of rows by the\n",
    "# same speaker shares one key.\n",
    "for conversation in df_list:\n",
    "    speaker_changed = conversation['Type'].ne(conversation['Type'].shift(1))\n",
    "    conversation['key'] = speaker_changed.astype(int).cumsum()\n",
    "\n",
    "# df_list2: keep only ['key', 'Type', 'Utterance'] and join the utterances of\n",
    "# each (key, Type) run with single spaces\n",
    "df_list2 = [\n",
    "    conversation[['key', 'Type', 'Utterance']]\n",
    "    .groupby(['key', 'Type'])['Utterance']\n",
    "    .apply(' '.join)\n",
    "    for conversation in df_list\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Series -> DataFrame, then turn the (key, Type) index levels back into columns\n",
    "df_list2 = [merged.to_frame().reset_index() for merged in df_list2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check again for repeated speakers: each printed index is a conversation in\n",
    "# df_list2 that still has two consecutive rows with the same 'Type'\n",
    "for i, turns in enumerate(df_list2):\n",
    "    speaker = turns['Type']\n",
    "    if speaker.eq(speaker.shift(1)).any():\n",
    "        
print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {},
   "outputs": [],
   "source": [
    "PAIR_COLUMNS = ['prompt', 'completion']\n",
    "\n",
    "\n",
    "def to_prompt_completion(turns):\n",
    "    \"\"\"Pair every patient turn with the turn that directly follows it.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    turns : pandas.DataFrame\n",
    "        One row per turn, with 'Type' and 'Utterance' columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns 'prompt' (utterance of a row whose 'Type' is 'P') and\n",
    "        'completion' (utterance of the next row). A 'P' row in the last\n",
    "        position has no following row and is left out.\n",
    "    \"\"\"\n",
    "    speakers = turns['Type'].tolist()\n",
    "    utterances = turns['Utterance'].tolist()\n",
    "    # Collect the rows first and build the frame once: DataFrame.append was\n",
    "    # removed in pandas 2.0 and copied the whole frame on every call.\n",
    "    pairs = [\n",
    "        {'prompt': utterances[k], 'completion': utterances[k + 1]}\n",
    "        for k in range(len(speakers) - 1)\n",
    "        if speakers[k] == 'P'\n",
    "    ]\n",
    "    return pd.DataFrame(pairs, columns=PAIR_COLUMNS)\n",
    "\n",
    "\n",
    "# save list of patient start index\n",
    "# (the length check keeps an empty conversation from raising on iloc[0])\n",
    "patient_start = [\n",
    "    i for i, df in enumerate(df_list2)\n",
    "    if len(df) > 0 and df.iloc[0]['Type'] == 'P'\n",
    "]\n",
    "\n",
    "# save list of therapist start index\n",
    "patient_start_set = set(patient_start)\n",
    "therapist_start = [i for i in range(len(df_list2)) if i not in patient_start_set]\n",
    "\n",
    "# create a list of dataframes, each dataframe is a conversation:\n",
    "# therapist-started conversations first, then patient-started ones\n",
    "df_list3 = [to_prompt_completion(df_list2[i]) for i in therapist_start + patient_start]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 332,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Im here',\n",
       " \"So I've been made to come in and talk to you. because something happened on the bus. And there I may have said something to somebody and they're just like, oh, you can't say that. So they actually said, we're going to take you to jail. from jail. They just called me here.\",\n",
       " \"They didn't explain anything. The government never explains anything. 
They just don't want me to understand what's going on because they just want to keep me out of captivity.\",\n", " \"nope, it's the entire government, entire government, higher government.\",\n", " \"all my life. I've done this all my life. They they're always listening. They're always around. They're always there.\",\n", " \"always there. Like, for example, they're in the lights right now. They're listening to everything that we're talking about. So that's the reason why I really can't talk about what happened on the bus because they're just going to use that against and then it's locked me up again. I think we take more minutes.\",\n", " \"Yes, I take the medication, because if I don't take the medication, government comes and takes me away and puts me into a hospital where they force me the medication. And then the lights are always on and they can always hear everything that I'm saying and what I'm thinking and what I'm going to do.\",\n", " \"wanting us to take over the world, to take over the world, people in the world, of course, they want to use everything and use everything that everybody knows to use it against them so that they can actually have the world they want in that baby and make everybody pretend that they don't know what's going on. So therefore, they're using everybody's thoughts against themselves, so that the government can use it to their benefit.\",\n", " \"No. So when I'm home, I think I'm perfectly fine because I set up my house in my room to actually make it government proof so that nobody can actually hear see what's going on in there. So when I'm home, I'm perfectly fine.\",\n", " \"being home and being home I've saved my mom saved my brother's save everybody safe. And being out in the world in the world. That's a nobody safe.\",\n", " \"No, I told you they are employed, and they can hear everything that we're saying right now. 
So no, we're not safe here.\",\n", " 'at home and safe in my room, that I mean government.',\n", " \"I would stay home all the time, but I can't stay home because I mean to go and see doctors and talk to people and do the things that the government needs me or wants me to do. But I don't want to do those things because she's in my thoughts and they're trying to use everybody and get sleep. I can't deal with that.\",\n", " \"Everybody knows that they're in the light. It's common knowledge. It's common knowledge. Everybody knows that.\",\n", " \"No, of course, everybody hears the lights talking to you. I don't know what you're talking about. Like everybody hears it. Sometimes it's a whisper. For some people. Sometimes it's a yell. And if you want to not pay attention to what the lights are saying to you, then I don't know what's wrong with you, but everybody can hear it.\",\n", " 'It sounds like a robot talking. It sounds like we need to know this information we need to what do you do.',\n", " 'Yes. where your notes gonna go?',\n", " 'who has access to the chart room.',\n", " \"Can I get a copy of your notes after you're done?\",\n", " \"Yes, because I might trust you. And I don't know if I do yet. But I don't trust the other people who work here because I don't know if they're working for the government, which they probably are working for the government. So they can actually have access to all of my notes and the things that you're writing down, which I don't know what they are, because you're not going to give me a copy of it yet. And I just don't trust that they can actually get that information and because they're going to use it to their advantage and they're going to give it to the government, which I can't have that. 
So could you please not take notes?\",\n", " \"Well, because I made my room at home government proof, I think they're going to try to steal that technology when I actually create it so that they can then disrupt my room and then everybody else's rooms who are just like mine so that they can then use their mind control to then control me to do the things that they want me to do.\",\n", " 'Correct. Without my room, and without the technology and the systems and the things that I created, then the government will be able to control everything that I do.',\n", " 'I feel fine',\n", " \"I don't really have any problem Yeah, I just I feel fine. I don't feel happy. I don't feel sad. I just, I'm just here\",\n", " \"well, the government's gonna do what they're gonna do anyway. So it doesn't matter if I'm happy sad, pissed off angry. any of that the government's going to make me do what they want me to do anyway. So it doesn't matter how I feel.\",\n", " \"No, because I've learned my lesson to not say those things out loud into to talk about what the government's doing because I can't actually inform anybody else because nobody's listening to me because They're already controlled by the government because they don't have a room like I do at home.\",\n", " \"While people don't understand what's going on, then that's their fault. that's their problem. They should have listened to me. And I could actually gave them the technology that they needed to actually create the room so that the government can't hear them.\",\n", " 'No. Quite the opposite. It is completely opposite.',\n", " \"Okay. Yeah, we live with my mom. She does that she's supposed to she doesn't go in my room because she knows that. She goes into the robots gonna mess up the technology so she stays out.\",\n", " 'I talk to people all the time, but they are in my head. 
We have conversations all the time about how we can actually make the world a better place and then how we can actually go through and then destroy the government or do good. Just pump it up.',\n",
       " 'that would be good', 'Three days works.'], dtype=object)"
      ]
     },
     "execution_count": 332,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_list3[110]['prompt'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_final = pd.DataFrame(columns=['prompt', 'completion'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 318,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the per-conversation frames into one frame with a fresh 0..n-1 index.\n",
    "# A single pd.concat replaces the DataFrame.append loop (append was removed in\n",
    "# pandas 2.0) and rebuilds df_final from df_list3 alone, so running this cell\n",
    "# again no longer duplicates rows.\n",
    "# pd.concat raises on an empty list, hence the explicit empty frame.\n",
    "df_final = (\n",
    "    pd.concat(df_list3, ignore_index=True)\n",
    "    if df_list3\n",
    "    else pd.DataFrame(columns=['prompt', 'completion'])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 319,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | prompt | \n", "completion | \n", "
---|---|---|
0 | \n", "Good, good. | \n", "It's good to see you. Yeah. | \n", "
1 | \n", "Yeah, I mean, good week with the roommates. I ... | \n", "Silence. Terrific. So what was what specifical... | \n", "
2 | \n", "is just telling her that like, my studies are ... | \n", "right. Right. Right. Because I think that's re... | \n", "
3 | \n", "So I think being a little way direct was helpful. | \n", "It's really nice, I think to you've been able ... | \n", "
4 | \n", "At home for two days, and it was with my mom a... | \n", "okay. So she's still sixers. Did she have a bi... | \n", "
... | \n", "... | \n", "... | \n", "
6439 | \n", "I actually came across a program, which is per... | \n", "So the program that you're interested, you wan... | \n", "
6440 | \n", "I want you to see if, if, if I could choose th... | \n", "Okay, well looking at my schedule, it probably... | \n", "
6441 | \n", "Right? I will. My daughter is 17 years old, ok... | \n", "So you're looking, you're looking for a day pr... | \n", "
6442 | \n", "Right. Right. And, I mean, I was looking for a... | \n", "You while you were there, it was difficult. Yo... | \n", "
6443 | \n", "no, she did not she. She just there just wasn'... | \n", "And so so just there wasn't when when you visi... | \n", "
6444 rows × 2 columns
\n", "