Spaces:

huggingface
/

paper-central

Running

App Files Files Community

IAMJB committed on Oct 1, 2024

Commit

df43c05

1 Parent(s): ff6d03f

author, github

Browse files

Files changed (5) hide show

app.py +43 -19
constants.py +2 -0
df/PaperCentral.py +57 -53
style.css +12 -0
utils.py +10 -4

app.py CHANGED Viewed

@@ -58,7 +58,8 @@ with gr.Blocks(css="style.css") as demo:
             )
             hf_options = gr.CheckboxGroup(
                 label="Hugging Face options",
-                choices=["show_details", "datasets", "models", "spaces"]
             )
         with gr.Column():
@@ -68,6 +69,11 @@ with gr.Blocks(css="style.css") as demo:
                 choices=["In proceedings"] + PaperCentral.CONFERENCES
             )
     # Define the Dataframe component to display paper data
     # List of columns in your DataFrame
     columns = paper_central_df.COLUMNS_START_PAPER_PAGE
@@ -91,7 +97,8 @@ with gr.Blocks(css="style.css") as demo:
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
-            conference_options_list: List[str]
     ) -> tuple:
         """
         Moves the selected date to the next day and updates the data.
@@ -120,7 +127,8 @@ with gr.Blocks(css="style.css") as demo:
             selected_date=new_date_str,
             cat_options=cat_options_list,
             hf_options=hf_options_list,
-            conference_options=conference_options_list
         )
         # Return the new date and updated Dataframe
@@ -132,7 +140,8 @@ with gr.Blocks(css="style.css") as demo:
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
-            conference_options_list: List[str]
     ) -> tuple:
         """
         Moves the selected date to the previous day and updates the data.
@@ -161,7 +170,8 @@ with gr.Blocks(css="style.css") as demo:
             selected_date=new_date_str,
             cat_options=cat_options_list,
             hf_options=hf_options_list,
-            conference_options=conference_options_list
         )
         # Return the new date and updated Dataframe
@@ -173,7 +183,8 @@ with gr.Blocks(css="style.css") as demo:
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
-            conference_options_list: List[str]
     ):
         """
         Updates the data displayed in the Dataframe based on the selected date and options.
@@ -191,7 +202,8 @@ with gr.Blocks(css="style.css") as demo:
             selected_date=date,
             cat_options=cat_options_list,
             hf_options=hf_options_list,
-            conference_options=conference_options_list
         )
@@ -200,7 +212,8 @@ with gr.Blocks(css="style.css") as demo:
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
-            conference_options_list: List[str]
     ):
         cat_options_update = gr.update()
@@ -216,6 +229,7 @@ with gr.Blocks(css="style.css") as demo:
                 [],
                 hf_options_list,
                 conference_options_list,
             )
             visible = False
@@ -231,7 +245,8 @@ with gr.Blocks(css="style.css") as demo:
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
-            conference_options_list: List[str]
     ):
         conference_options_update = gr.update()
         paper_central_component_update = gr.update()
@@ -246,6 +261,7 @@ with gr.Blocks(css="style.css") as demo:
                 cat_options_list,
                 hf_options_list,
                 [],
             )
             visible = True
@@ -256,29 +272,36 @@ with gr.Blocks(css="style.css") as demo:
         return paper_central_component_update, conference_options_update, calendar_update, next_day_btn_update, prev_day_btn_update
     # Set up the event listener for the 'Next Day' button
     next_day_btn.click(
         fn=go_to_next_day,
-        inputs=[calendar, cat_options, hf_options, conference_options],
         outputs=[calendar, paper_central_component],
     )
     # Set up the event listener for the 'Previous Day' button
     prev_day_btn.click(
         fn=go_to_previous_day,
-        inputs=[calendar, cat_options, hf_options, conference_options],
         outputs=[calendar, paper_central_component],
     )
-    # Define the inputs for the filter function
-    inputs = [
-        calendar,
-        cat_options,
-        hf_options,
-        conference_options,
-    ]
     # Set up the event listener for the calendar date change
     calendar.change(
         fn=update_data,
@@ -293,6 +316,7 @@ with gr.Blocks(css="style.css") as demo:
         outputs=paper_central_component,
     )
     # Event chaining for conference options change
     conference_options.change(
         fn=on_conference_options_change,

             )
             hf_options = gr.CheckboxGroup(
                 label="Hugging Face options",
+                choices=["🤗 paper-page", "datasets", "models", "spaces", "github"],
+                elem_id="hf_options"
             )
         with gr.Column():
                 choices=["In proceedings"] + PaperCentral.CONFERENCES
             )
+            # Define a Textbox for author search
+            author_search = gr.Textbox(
+                label="Search Authors",
+                placeholder="Enter author name",
+            )
     # Define the Dataframe component to display paper data
     # List of columns in your DataFrame
     columns = paper_central_df.COLUMNS_START_PAPER_PAGE
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
+            conference_options_list: List[str],
+            author_search_input: str,
     ) -> tuple:
         """
         Moves the selected date to the next day and updates the data.
             selected_date=new_date_str,
             cat_options=cat_options_list,
             hf_options=hf_options_list,
+            conference_options=conference_options_list,
+            author_search_input=author_search_input,
         )
         # Return the new date and updated Dataframe
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
+            conference_options_list: List[str],
+            author_search_input: str,
     ) -> tuple:
         """
         Moves the selected date to the previous day and updates the data.
             selected_date=new_date_str,
             cat_options=cat_options_list,
             hf_options=hf_options_list,
+            conference_options=conference_options_list,
+            author_search_input=author_search_input,
         )
         # Return the new date and updated Dataframe
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
+            conference_options_list: List[str],
+            author_search_input: str,
     ):
         """
         Updates the data displayed in the Dataframe based on the selected date and options.
             selected_date=date,
             cat_options=cat_options_list,
             hf_options=hf_options_list,
+            conference_options=conference_options_list,
+            author_search_input=author_search_input,
         )
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
+            conference_options_list: List[str],
+            author_search_input: str,
     ):
         cat_options_update = gr.update()
                 [],
                 hf_options_list,
                 conference_options_list,
+                author_search_input,
             )
             visible = False
             date: Union[str, datetime],
             cat_options_list: List[str],
             hf_options_list: List[str],
+            conference_options_list: List[str],
+            author_search_input: str,
     ):
         conference_options_update = gr.update()
         paper_central_component_update = gr.update()
                 cat_options_list,
                 hf_options_list,
                 [],
+                author_search_input,
             )
             visible = True
         return paper_central_component_update, conference_options_update, calendar_update, next_day_btn_update, prev_day_btn_update
+    inputs = [
+        calendar,
+        cat_options,
+        hf_options,
+        conference_options,
+        author_search,
+    ]
+    # Set up the event listener for the author search
+    author_search.submit(
+        fn=update_data,
+        inputs=inputs,
+        outputs=paper_central_component,
+    )
     # Set up the event listener for the 'Next Day' button
     next_day_btn.click(
         fn=go_to_next_day,
+        inputs=inputs,
         outputs=[calendar, paper_central_component],
     )
     # Set up the event listener for the 'Previous Day' button
     prev_day_btn.click(
         fn=go_to_previous_day,
+        inputs=inputs,
         outputs=[calendar, paper_central_component],
     )
     # Set up the event listener for the calendar date change
     calendar.change(
         fn=update_data,
         outputs=paper_central_component,
     )
     # Event chaining for conference options change
     conference_options.change(
         fn=on_conference_options_change,

constants.py CHANGED Viewed

@@ -4,3 +4,5 @@ DATASET_CONFERENCE_PAPERS = "IAMJB/paper_conference_aggregate"
 DATASET_DAILY_PAPERS = "hysts-bot-data/daily-papers"
 DATASET_DAILY_PAPERS_STATS = "hysts-bot-data/daily-papers-stats"
 DATASET_COMMUNITY_SCIENCE = "huggingface/community-science-paper-v2"

 DATASET_DAILY_PAPERS = "hysts-bot-data/daily-papers"
 DATASET_DAILY_PAPERS_STATS = "hysts-bot-data/daily-papers-stats"
 DATASET_COMMUNITY_SCIENCE = "huggingface/community-science-paper-v2"
+# DATASET_PAPER_CENTRAL = "huggingface/paper-central-data"
+DATASET_PAPER_CENTRAL = "huggingface/paper-central-data-2"

df/PaperCentral.py CHANGED Viewed

@@ -5,6 +5,7 @@ from constants import (
     DATASET_CONFERENCE_PAPERS,
     DATASET_COMMUNITY_SCIENCE,
     NEURIPS_ICO,
 )
 import gradio as gr
 from utils import load_and_process
@@ -56,13 +57,15 @@ class PaperCentral:
         'num_models',
         'num_datasets',
         'num_spaces',
         'conference_name',
         'id',
         'type',
         'proceedings',
         'title',
-        'upvotes',
-        'num_comments',
     ]
     DATATYPES: Dict[str, str] = {
@@ -74,11 +77,13 @@ class PaperCentral:
         'num_models': 'markdown',
         'num_datasets': 'markdown',
         'num_spaces': 'markdown',
         'title': 'str',
         'proceedings': 'markdown',
         'conference_name': 'str',
         'id': 'str',
         'type': 'str',
     }
     # Mapping for renaming columns for display purposes
@@ -131,55 +136,13 @@ class PaperCentral:
             pd.DataFrame: The merged and processed DataFrame.
         """
         # Load datasets
-        arxiv_scan_papers: pd.DataFrame = load_and_process(DATASET_ARXIV_SCAN_PAPERS)[
-            ['arxiv_id', 'published_date', 'categories', 'title', 'primary_category',
-             'huggingface_urls']
         ]
-        arxiv_scan_papers['published_date'] = pd.to_datetime(arxiv_scan_papers['published_date']) + pd.DateOffset(
-            days=1)
-        community_science_papers: pd.DataFrame = load_and_process(DATASET_COMMUNITY_SCIENCE)[
-            ['arxiv_id', 'date', 'upvotes', 'num_comments', 'github', 'num_models', 'num_datasets', 'num_spaces',
-             'title']
-        ]
-        conference_papers: pd.DataFrame = load_and_process(DATASET_CONFERENCE_PAPERS)[
-            ['id', 'proceedings', 'type', 'arxiv_id', 'title', 'conference_name']
-        ]
-        # Merge arxiv_scan_papers and community_science_papers on 'arxiv_id'
-        merged_df: pd.DataFrame = pd.merge(arxiv_scan_papers, community_science_papers, on='arxiv_id', how='outer')
-        merged_df['title'] = merged_df['title_x'].combine_first(merged_df['title_y'])
-        merged_df = merged_df.drop(columns=['title_x', 'title_y'])
-        final_merged_df: pd.DataFrame = pd.merge(
-            merged_df,
-            conference_papers,
-            on='arxiv_id',
-            how='outer'
-        )
-        # Combine the 'title' columns into one
-        final_merged_df['title'] = final_merged_df['title_x'].combine_first(final_merged_df['title_y'])
-        # Drop the redundant 'title_x' and 'title_y' columns
-        final_merged_df = final_merged_df.drop(columns=['title_x', 'title_y'])
-        # Use 'date' from community_science_papers if available; otherwise, use 'published_date'
-        final_merged_df['date'] = final_merged_df['date'].combine_first(final_merged_df['published_date'])
-        final_merged_df.drop(columns=['published_date'], inplace=True)
-        # If 'arxiv_id' is in community_science_papers, set 'paper_page' to 'arxiv_id'
-        final_merged_df.loc[
-            final_merged_df['arxiv_id'].isin(community_science_papers['arxiv_id']), 'paper_page'
-        ] = final_merged_df['arxiv_id']
-        # Format the 'date' column
-        final_merged_df = PaperCentral.format_df_date(final_merged_df, "date")
-        final_merged_df['date'] = final_merged_df['date'].astype(str)
-        print(final_merged_df.head())
-        return final_merged_df
     @staticmethod
     def format_df_date(df: pd.DataFrame, date_column: str = "date") -> pd.DataFrame:
@@ -259,11 +222,11 @@ class PaperCentral:
             ### This should be processed last :)
             ####
             # Add markdown link to 'paper_page' if it exists
-            if 'paper_page' in row and pd.notna(row['paper_page']):
                 row['paper_page'] = f"🤗[paper_page](https://huggingface.co/papers/{row['paper_page']})"
             # Add image and link to 'arxiv_id' if it exists
-            if 'arxiv_id' in row and pd.notna(row['arxiv_id']):
                 image_url = "https://arxiv.org/static/browse/0.3.4/images/icons/favicon-16x16.png"
                 style = "display:inline-block; vertical-align:middle;"
                 row['arxiv_id'] = (
@@ -271,6 +234,15 @@ class PaperCentral:
                     f"<a href='https://arxiv.org/abs/{row['arxiv_id']}'>arxiv_page</a>"
                 )
             return row
         df = df.copy()
@@ -302,7 +274,8 @@ class PaperCentral:
             selected_date: Optional[str] = None,
             cat_options: Optional[List[str]] = None,
             hf_options: Optional[List[str]] = None,
-            conference_options: Optional[List[str]] = None
     ) -> gr.update:
         """
         Filter the DataFrame based on selected date and options, and prepare it for display.
@@ -320,6 +293,32 @@ class PaperCentral:
         # Start with the initial columns to display
         columns_to_show: List[str] = PaperCentral.COLUMNS_START_PAPER_PAGE.copy()
         if cat_options:
             options = [o.replace(".*", "") for o in cat_options]
             # Initialize filter series
@@ -339,7 +338,7 @@ class PaperCentral:
         # HF options
         if hf_options:
-            if "show_details" in hf_options:
                 # Filter rows where 'paper_page' is not empty or NaN
                 filtered_df = filtered_df[
                     (filtered_df['paper_page'] != "") & (filtered_df['paper_page'].notna())
@@ -371,6 +370,11 @@ class PaperCentral:
                     columns_to_show.append('num_spaces')
                 filtered_df = filtered_df[filtered_df['num_spaces'] != 0]
         # Apply conference filtering
         if conference_options:

     DATASET_CONFERENCE_PAPERS,
     DATASET_COMMUNITY_SCIENCE,
     NEURIPS_ICO,
+    DATASET_PAPER_CENTRAL,
 )
 import gradio as gr
 from utils import load_and_process
         'num_models',
         'num_datasets',
         'num_spaces',
+        'upvotes',
+        'num_comments',
+        'github',
         'conference_name',
         'id',
         'type',
         'proceedings',
         'title',
+        'authors',
     ]
     DATATYPES: Dict[str, str] = {
         'num_models': 'markdown',
         'num_datasets': 'markdown',
         'num_spaces': 'markdown',
+        'github': 'markdown',
         'title': 'str',
         'proceedings': 'markdown',
         'conference_name': 'str',
         'id': 'str',
         'type': 'str',
+        'authors': 'str',
     }
     # Mapping for renaming columns for display purposes
             pd.DataFrame: The merged and processed DataFrame.
         """
         # Load datasets
+        paper_central_df: pd.DataFrame = load_and_process(DATASET_PAPER_CENTRAL)[
+            ['arxiv_id', 'categories', 'primary_category', 'date', 'upvotes', 'num_comments', 'github', 'num_models',
+             'num_datasets', 'num_spaces', 'id', 'proceedings', 'type',
+             'conference_name', 'title', 'paper_page', 'authors']
         ]
+        return paper_central_df
     @staticmethod
     def format_df_date(df: pd.DataFrame, date_column: str = "date") -> pd.DataFrame:
             ### This should be processed last :)
             ####
             # Add markdown link to 'paper_page' if it exists
+            if 'paper_page' in row and pd.notna(row['paper_page']) and row['paper_page']:
                 row['paper_page'] = f"🤗[paper_page](https://huggingface.co/papers/{row['paper_page']})"
             # Add image and link to 'arxiv_id' if it exists
+            if 'arxiv_id' in row and pd.notna(row['arxiv_id']) and row['arxiv_id']:
                 image_url = "https://arxiv.org/static/browse/0.3.4/images/icons/favicon-16x16.png"
                 style = "display:inline-block; vertical-align:middle;"
                 row['arxiv_id'] = (
                     f"<a href='https://arxiv.org/abs/{row['arxiv_id']}'>arxiv_page</a>"
                 )
+            # Add image and link to 'github' if it exists
+            if 'github' in row and pd.notna(row['github']) and row["github"]:
+                image_url = "https://github.githubassets.com/favicons/favicon.png"
+                style = "display:inline-block; vertical-align:middle;width:16px;"
+                row['github'] = (
+                    f"<img src='{image_url}' style='{style}'/>"
+                    f"<a href='{row['github']}'>github</a>"
+                )
             return row
         df = df.copy()
             selected_date: Optional[str] = None,
             cat_options: Optional[List[str]] = None,
             hf_options: Optional[List[str]] = None,
+            conference_options: Optional[List[str]] = None,
+            author_search_input: Optional[str] = None,
     ) -> gr.update:
         """
         Filter the DataFrame based on selected date and options, and prepare it for display.
         # Start with the initial columns to display
         columns_to_show: List[str] = PaperCentral.COLUMNS_START_PAPER_PAGE.copy()
+        if author_search_input:
+            if 'authors' not in columns_to_show:
+                columns_to_show.append('authors')
+            search_string = author_search_input.lower()
+            def author_matches(authors_list):
+                # Check if authors_list is None or empty
+                if authors_list is None or len(authors_list) == 0:
+                    return False
+                # Check if authors_list is an iterable (list, tuple, Series, or ndarray)
+                if isinstance(authors_list, (list, tuple, pd.Series, np.ndarray)):
+                    return any(
+                        isinstance(author, str) and search_string in author.lower()
+                        for author in authors_list
+                    )
+                elif isinstance(authors_list, str):
+                    # If authors_list is a single string
+                    return search_string in authors_list.lower()
+                else:
+                    # Handle unexpected data types
+                    return False
+            filtered_df = filtered_df[filtered_df['authors'].apply(author_matches)]
         if cat_options:
             options = [o.replace(".*", "") for o in cat_options]
             # Initialize filter series
         # HF options
         if hf_options:
+            if "🤗 paper-page" in hf_options:
                 # Filter rows where 'paper_page' is not empty or NaN
                 filtered_df = filtered_df[
                     (filtered_df['paper_page'] != "") & (filtered_df['paper_page'].notna())
                     columns_to_show.append('num_spaces')
                 filtered_df = filtered_df[filtered_df['num_spaces'] != 0]
+            if "github" in hf_options:
+                if 'github' not in columns_to_show:
+                    columns_to_show.append('github')
+                filtered_df = filtered_df[(filtered_df['github'] != "") & (filtered_df['github'].notnull())]
         # Apply conference filtering
         if conference_options:

style.css CHANGED Viewed

	@@ -21,3 +21,15 @@ body a:hover {
21	}
22
23

 }
+#hf_options label[for='github']::before {
+    content: "";
+    background-image: url('https://github.githubassets.com/favicons/favicon.png');
+    background-size: contain;
+    display: inline-block;
+    width: 16px;
+    height: 16px;
+    vertical-align: middle;
+    margin-right: 5px;
+}

utils.py CHANGED Viewed

@@ -1,11 +1,17 @@
-import re
 from datasets import load_dataset
 def arxiv_remove_version_suffix(arxiv_id):
-    # Use regex to remove version suffix (e.g., v1, v2, etc.) if present
-    cleaned_id = re.sub(r'v\d+$', '', arxiv_id)
-    return cleaned_id
 # Load datasets

 from datasets import load_dataset
+import re
 def arxiv_remove_version_suffix(arxiv_id):
+    if arxiv_id is None:
+        return None
+    # Ensure arxiv_id is a string before applying regex
+    elif isinstance(arxiv_id, str):
+        cleaned_id = re.sub(r'v\d+$', '', arxiv_id)
+        return cleaned_id
+    else:
+        # Handle unexpected types
+        return arxiv_id
 # Load datasets