Spaces:

mangoesai
/

Refresh_Praw_pinecone_dataset

Sleeping

App Files Files Community

Vera-ZWY committed on Nov 20, 2024

Commit

14cf6e5

verified ·

1 Parent(s): 8b96174

Create praw_auhtor_info.py

Browse files

Files changed (1) hide show

praw_auhtor_info.py +68 -0

praw_auhtor_info.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from datasets import load_dataset
+import pandas as pd
+import praw
+import time
+from tqdm import tqdm
+def initialize_reddit():
+    return praw.Reddit(
+        client_id="RPAW_CLIENT_ID",
+        client_secret="RPAW_CLIENT_SECRET",
+        user_agent="PRAW_AGENT"
+    )
+def get_author_info(reddit, submission_id):
+    try:
+        submission = reddit.submission(id=submission_id)
+        author = submission.author
+        if author is None:
+            return {
+                'author_name': '[deleted]',
+                'karma': None,
+                'account_age_days': None,
+                'is_mod': None
+            }
+        return {
+            'author_name': author.name,
+            'karma': author.link_karma + author.comment_karma,
+            'account_age_days': (time.time() - author.created_utc) / 86400,
+            'is_mod': author.is_mod if hasattr(author, 'is_mod') else None
+        }
+    except Exception as e:
+        print(f"Error fetching author info for submission {submission_id}: {e}")
+        return {
+            'author_name': None,
+            'karma': None,
+            'account_age_days': None,
+            'is_mod': None
+        }
+def praw_auhtors_to_path(ds_repo_id, file_path):
+    # Initialize Reddit API
+    reddit = initialize_reddit()
+    # Load dataset from Hugging Face
+    dataset = load_dataset(ds_repo_id,
+                         data_files={'train': file_path},
+                         split='train')
+    df = pd.DataFrame(dataset)
+    # Fetch author info for each submission
+    author_data = []
+    for submission_id in tqdm(df['id']):
+        author_info = get_author_info(reddit, submission_id)
+        author_data.append(author_info)
+        time.sleep(1)  # Rate limiting
+    # Create DataFrame with author info
+    author_df = pd.DataFrame(author_data)
+    # Merge with original data
+    result_df = pd.concat([df, author_df], axis=1)
+    # Save result
+    output_file = f"submissions_with_authors_{time.strftime('%Y%m%d')}.csv"
+    result_df.to_csv(output_file, index=False)
+    print(f"Saved to {output_file}")