import gradio as gr
from datasets import load_dataset
import pandas as pd
import sys
import subprocess
from datetime import datetime
from huggingface_hub import HfApi

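# Gradio front-end for crawling (praw.py) and merging (merge.py) Reddit election
# submissions stored in the HuggingFace dataset repo referenced below.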
					
						
def get_newest_file(repo_id, prefix):
    """Get the newest file with the given prefix from the HuggingFace repo"""
    api = HfApi()
    files = api.list_repo_files(repo_id, repo_type="dataset")
    relevant_files = [f for f in files if f.startswith(prefix)]

    if not relevant_files:
        return None

    # Parse the trailing YYYYMMDD date from each filename; skip files that don't match.
    file_dates = []
    for filename in relevant_files:
        try:
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue

    if not file_dates:
        return None

    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]
    return newest_file

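# Example of the filename convention assumed above (illustrative name only):
#   "submissions/df_20241105.csv" -> date_str "20241105" -> datetime(2024, 11, 5)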
					
						
def load_data(repo_id, file_path):
    """Load a file from the HuggingFace repo and return its first 3 rows as a DataFrame"""
    try:
        dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        df = pd.DataFrame(dataset)
        return df.head(3)
    except Exception as e:
        # Surface load failures in the UI as a one-cell DataFrame instead of raising.
        return pd.DataFrame({'Error': [str(e)]})

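# Illustrative call (hypothetical file path; real paths come from get_newest_file):
#   load_data("Vera-ZWY/reddite2024elections_submissions", "submissions/df_20241105.csv")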
					
						
def praw_new_data():
    """Execute praw.py and show the latest data"""
    try:
        subprocess.run([sys.executable, "praw.py"], check=True)
        success_message = "✅ Successfully crawled new data!"
    except Exception as e:
        success_message = f"❌ Error executing praw.py: {str(e)}"

    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    newest_file = get_newest_file(repo_id, "submissions/df_")

    if newest_file:
        df = load_data(repo_id, newest_file)
        return success_message, df, load_merged_data()[1]
    else:
        return "No crawled data files found", pd.DataFrame(), load_merged_data()[1]

					
						
def merge_data():
    """Execute merge.py and show the latest merged data"""
    try:
        subprocess.run([sys.executable, "merge.py"], check=True)
        success_message = "✅ Successfully merged data!"
    except Exception as e:
        success_message = f"❌ Error executing merge.py: {str(e)}"

    merged_df = load_merged_data()[1]
    crawled_df = load_crawled_data()[1]
    return success_message, crawled_df, merged_df

					
						
def load_crawled_data():
    """Load latest crawled data"""
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    newest_file = get_newest_file(repo_id, "submissions/df_24")

    if newest_file:
        return f"Latest crawled data ({newest_file}):", load_data(repo_id, newest_file)
    return "No crawled data available", pd.DataFrame()

					
						
def load_merged_data():
    """Load latest merged data"""
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    # The merged dataset lives at a fixed path, so no date-based lookup is needed;
    # load_data reports any failure as an error DataFrame.
    newest_merged = "submission/merged_reddit_data.csv"
    return f"Latest merged data ({newest_merged}):", load_data(repo_id, newest_merged)

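# --- UI: one status box, two action buttons, and a 3-row preview per dataset ---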
					
						
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    status_text = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data", variant="primary")
        with gr.Column():
            merge_button = gr.Button("Merge Data", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Crawled Data (Top 3 Rows)")
            crawled_table = gr.Dataframe(
                headers=["title", "score", "id", "url", "comms_num", "created", "body", "subreddit"],
                value=load_crawled_data()[1],
                wrap=True
            )

					
						
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Merged Data (Top 3 Rows)")
            merged_table = gr.Dataframe(
                headers=["title", "score", "id", "url", "num_comments", "created", "body", "content", "subreddit"],
                value=load_merged_data()[1],
                wrap=True
            )

					
						
    # Both handlers return (status message, crawled preview, merged preview),
    # matching the three output components listed here.
    praw_button.click(
        fn=praw_new_data,
        outputs=[status_text, crawled_table, merged_table]
    )

    merge_button.click(
        fn=merge_data,
        outputs=[status_text, crawled_table, merged_table]
    )

					
						
    gr.Markdown("""
    ## The full dataset is stored at https://huggingface.co/datasets/Vera-ZWY/reddite2024elections_submissions/
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit data
    2. Click 'Merge Data' to merge the latest datasets
    3. Tables will automatically update to show the latest data
    """)

					
						
if __name__ == "__main__":
    iface.launch()