import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
import time
import os
from datetime import datetime

# Save the Hugging Face token so the HfApi calls below are authenticated.
HfFolder.save_token(os.getenv("HF_TOKEN"))
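
# Required environment variables (names as used in this script):
#   HF_TOKEN            - Hugging Face access token
#   PRAW_CLIENT_ID      - Reddit API client ID
#   PRAW_CLIENT_SECRET  - Reddit API client secret
#   RPAW_AGENT          - Reddit API user agent string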
					
						
# Initialise the PRAW Reddit client; check_for_async=False silences the
# warning PRAW emits when it detects it might be running in an async context.
try:
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False,
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")


def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
    """Scrape posts from a subreddit (or search all of Reddit) into a DataFrame."""
    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        posts = reddit.subreddit("all").search(keywords, limit=limit)

    for post in posts:
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name,
            }
            posts_data.append(post_data)

            # Brief pause between posts to stay within Reddit's rate limits.
            time.sleep(0.1)

        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    if df.empty:
        # Guard against an empty result: indexing 'title'/'body' below would raise a KeyError.
        return df
    df['content'] = df['title'] + '\n' + df['body']
    return df
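
# Example usage (hypothetical subreddit and query, for illustration only):
#   posts_df = scrape_reddit(subreddit_name="politics", keywords="election", limit=500)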
					
						
def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Flatten the comment tree; replace_more(limit=0) drops "load more comments"
        # placeholders instead of fetching them.
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()

        # Keep only the top-scoring comments, up to the requested limit.
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:
            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc),
                }
                comments_data.append(comment_data)
            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        df = pd.DataFrame(comments_data)

        if not df.empty:
            df = df.sort_values('score', ascending=False)

        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()
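
# Example usage (hypothetical post ID, for illustration only):
#   top_comments = get_comments(reddit, "1abcd2", limit=50)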
					
						
def get_comments_and_upload(df, dataset_repo_id):
    """For each post in df, fetch its top comments and upload them as a CSV to the dataset repo."""
    api = HfApi()
    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    for _, row in df.iterrows():
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        # Skip posts whose comment CSVs are already in the repo.
        if repo_csv_path in existing_files:
            print(f"Skipping {row['id']}: {repo_csv_path} already uploaded.")
            continue

        comments_df = get_comments(reddit, row['id'])

        if len(comments_df) == 0:
            print(f"No comments found for {row['id']}")
            continue

        comments_df.to_csv(csv_file_path, index=False)

        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=repo_csv_path,
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )
        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Remove the local copy once it has been uploaded.
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")
					
						
def main():
    try:
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")

            # 'created' is already a datetime (set in scrape_reddit), so no
            # epoch-seconds unit conversion is needed here.
            df['created'] = pd.to_datetime(df['created'])
            df = df.sort_values(by='created', ascending=True)
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"

            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                print(f"Dataset {dataset_repo_id} already exists.")
            except Exception:
                print(f"Dataset {dataset_repo_id} will be created.")

            today_date = datetime.now().strftime('%Y%m%d')
            filename = f"df_24_{today_date}.csv"
            df_24.to_csv(filename, index=False)

            api.upload_file(
                path_or_fileobj=filename,
                path_in_repo=f"submissions/{filename}",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            get_comments_and_upload(df_24, dataset_repo_id)

        else:
            print("No data was retrieved")

    except Exception as e:
        print(f"Error in main: {str(e)}")
					
						
if __name__ == '__main__':
    main()