Abu1998 committed on
Commit
49bf9dd
·
verified ·
1 Parent(s): 67c445f

Update data_cleaning.py

Browse files
Files changed (1) hide show
  1. data_cleaning.py +2 -8
data_cleaning.py CHANGED
@@ -10,19 +10,13 @@ def clean_data(df):
10
  df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
11
 
12
  # Drop rows where 'Reply' column is missing
13
- before = df.shape[0]
14
  df.dropna(subset=['Reply'], inplace=True)
15
- after = df.shape[0]
16
 
17
  # Calculate comment and reply lengths
18
  df['comment_length'] = df['Comment'].str.len()
19
  df['reply_length'] = df['Reply'].str.len()
20
 
21
  # Remove duplicate rows
22
- num_duplicates = df.duplicated().sum()
23
- df_deduplicated = df.drop_duplicates()
24
 
25
- # Print number of duplicates
26
- print('Number of duplicate rows:', num_duplicates)
27
-
28
- return df_deduplicated
 
10
  df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
11
 
12
  # Drop rows where 'Reply' column is missing
 
13
  df.dropna(subset=['Reply'], inplace=True)
 
14
 
15
  # Calculate comment and reply lengths
16
  df['comment_length'] = df['Comment'].str.len()
17
  df['reply_length'] = df['Reply'].str.len()
18
 
19
  # Remove duplicate rows
20
+ df = df.drop_duplicates()
 
21
 
22
+ return df