Spaces:
Sleeping
Sleeping
| import pandas as pd | |
def clean_data(df):
    """Return a cleaned, de-duplicated copy of a comment/reply DataFrame.

    Steps, in order:

    1. Strip the literal prefix ``'http://www.youtube.com/@'`` from ``Channel``.
    2. Forward-fill missing ``Comment`` and ``CommentedUserID`` values from
       the nearest preceding row.
    3. Fill missing ``ToWhomTheyReplied`` values with the (already stripped)
       ``Channel`` value of the same row.
    4. Drop rows whose ``Reply`` is missing.
    5. Add ``comment_length`` and ``reply_length`` columns holding the
       string lengths of ``Comment`` and ``Reply``.
    6. Print the number of fully duplicated rows and drop them.

    Args:
        df: DataFrame containing at least the columns ``Channel``,
            ``Comment``, ``CommentedUserID``, ``ToWhomTheyReplied`` and
            ``Reply``. It is not modified.

    Returns:
        A new DataFrame with the cleaning steps applied. Rows keep their
        original index labels; the index is not reset.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    # Work on a copy so the caller's DataFrame is never left half-cleaned.
    cleaned = df.copy()

    # Literal (non-regex) replacement; only this exact prefix is removed.
    cleaned['Channel'] = cleaned['Channel'].str.replace(
        'http://www.youtube.com/@', '', regex=False
    )

    # Series.ffill() is the supported spelling; fillna(method='ffill') is
    # deprecated in current pandas releases.
    cleaned['Comment'] = cleaned['Comment'].ffill()
    cleaned['CommentedUserID'] = cleaned['CommentedUserID'].ffill()
    cleaned['ToWhomTheyReplied'] = cleaned['ToWhomTheyReplied'].fillna(
        cleaned['Channel']
    )

    # Rows without a reply are dropped.
    cleaned = cleaned.dropna(subset=['Reply'])

    # assign() builds a fresh frame, which avoids chained-assignment warnings
    # on the filtered result while keeping the same column order.
    cleaned = cleaned.assign(
        comment_length=cleaned['Comment'].str.len(),
        reply_length=cleaned['Reply'].str.len(),
    )

    # Count duplicates before removing them so the figure can be reported.
    num_duplicates = cleaned.duplicated().sum()
    print('Number of duplicate rows:', num_duplicates)

    return cleaned.drop_duplicates()