Spaces:
Sleeping
Sleeping
Update data_cleaning.py
Browse files
- data_cleaning.py +2 -8
data_cleaning.py
CHANGED
@@ -10,19 +10,13 @@ def clean_data(df):
|
|
10 |
df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
|
11 |
|
12 |
# Drop rows where 'Reply' column is missing
|
13 |
-
before = df.shape[0]
|
14 |
df.dropna(subset=['Reply'], inplace=True)
|
15 |
-
after = df.shape[0]
|
16 |
|
17 |
# Calculate comment and reply lengths
|
18 |
df['comment_length'] = df['Comment'].str.len()
|
19 |
df['reply_length'] = df['Reply'].str.len()
|
20 |
|
21 |
# Remove duplicate rows
|
22 |
-
|
23 |
-
df_deduplicated = df.drop_duplicates()
|
24 |
|
25 |
-
|
26 |
-
print('Number of duplicate rows:', num_duplicates)
|
27 |
-
|
28 |
-
return df_deduplicated
|
|
|
10 |
df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
|
11 |
|
12 |
# Drop rows where 'Reply' column is missing
|
|
|
13 |
df.dropna(subset=['Reply'], inplace=True)
|
|
|
14 |
|
15 |
# Calculate comment and reply lengths
|
16 |
df['comment_length'] = df['Comment'].str.len()
|
17 |
df['reply_length'] = df['Reply'].str.len()
|
18 |
|
19 |
# Remove duplicate rows
|
20 |
+
df = df.drop_duplicates()
|
|
|
21 |
|
22 |
+
return df
|
|
|
|
|
|