gaetanbrison committed on
Commit
53dd3ee
Β·
verified Β·
1 Parent(s): b91b0f0

Upload 3 files

Browse files
Files changed (3) hide show
  1. app_gradio.py +132 -0
  2. hi-paris.png +0 -0
  3. processed_dataset_v6.csv +0 -0
app_gradio.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from nltk.corpus import stopwords
import nltk

# Ensure NLTK stopwords are available
# (runs at import time on every startup; NLTK caches the corpus locally
# after the first download)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Additional words to remove
# NOTE(review): many entries duplicate the NLTK stop-word list and each
# other ('what', 'who', 'which', ... appear twice); harmless in a set
# literal, but the list could be trimmed.
irrelevant_words = {"what", "paper", "abstract", "papers", "discuss", "find", "about","who","one","two",'is','are','the','this','that','which','how','what','where','when','why','who','whom','whose','which','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now'}


# Load the dataset
# (read eagerly at module import; the app fails fast if the CSV is missing)
file_path = "processed_dataset_v6.csv" # Path to uploaded file
df = pd.read_csv(file_path)
21
def preprocess_text(text):
    """Normalize a free-text question for keyword matching.

    Lowercases the input, strips all ASCII punctuation, then drops every
    token found in the NLTK ``stop_words`` set or in the module-level
    ``irrelevant_words`` set.

    Parameters
    ----------
    text : str
        Raw user input.

    Returns
    -------
    str
        Space-joined sequence of the surviving tokens.
    """
    # Lowercase and remove punctuation in a single translation pass.
    cleaned = text.lower().translate(str.maketrans("", "", string.punctuation))

    kept = []
    for token in cleaned.split():
        # Skip both generic English stop words and app-specific noise words.
        if token in stop_words or token in irrelevant_words:
            continue
        kept.append(token)

    return " ".join(kept)
34
+
35
def format_doi_url(doi):
    """Return the AEA article page URL for the given DOI.

    Parameters
    ----------
    doi : str
        DOI identifier as stored in the dataset's ``doi`` column.

    Returns
    -------
    str
        Full ``aeaweb.org`` article link for that DOI.
    """
    base = "https://www.aeaweb.org/articles?id="
    return base + format(doi)
38
+
39
def analyze_keywords(question, threshold=0.2):
    """Rank papers in the module-level ``df`` by TF-IDF similarity to *question*.

    Parameters
    ----------
    question : str
        Free-text user question; normalized via ``preprocess_text`` before
        matching against each paper's ``top_keywords``.
    threshold : float, optional
        Minimum cosine similarity for a paper to be reported (default 0.2).

    Returns
    -------
    str
        Markdown listing of the matching papers (title, DOI link, topics,
        keywords, score), or an explanatory message / error string.
    """
    # Check if the required columns exist
    required = ("Title", "doi", "top_topics", "top_keywords")
    if not all(col in df.columns for col in required):
        return "The dataset must have 'Title', 'doi', 'top_topics', and 'top_keywords' columns."

    try:
        # Preprocess the question
        processed_question = preprocess_text(question)

        # Vectorize the question together with the keyword corpus so both
        # share one TF-IDF vocabulary; the question is the last row.
        corpus = df["top_keywords"].fillna("").tolist()
        corpus.append(processed_question)

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Similarity between the question (last row) and every paper.
        question_vector = tfidf_matrix[-1]
        similarities = cosine_similarity(tfidf_matrix[:-1], question_vector).flatten()

        # Collect papers above the similarity threshold.
        relevant_papers = []
        for idx, score in enumerate(similarities):
            if score < threshold:
                continue
            row = df.iloc[idx]  # hoisted: one positional lookup per paper instead of four
            relevant_papers.append({
                "Title": row["Title"],
                "DOI": format_doi_url(row["doi"]),  # Format DOI correctly
                "Top Topics": row["top_topics"],
                "Top Keywords": row["top_keywords"],
                # NOTE(review): the +0.5 offset inflates the displayed cosine
                # similarity (can exceed 1.0) — presumably a cosmetic choice;
                # confirm intent before removing.
                "Score": round(score + 0.5, 2),
            })

        # Sort papers by similarity score (descending order)
        relevant_papers.sort(key=lambda paper: paper["Score"], reverse=True)

        if not relevant_papers:
            return "No relevant papers found."

        # Build the Markdown output with a join instead of repeated +=.
        parts = ["### Relevant Papers\n\n"]
        for paper in relevant_papers:
            parts.append(f"**Title**: {paper['Title']}\n\n")
            parts.append(f"**DOI**: [Link]({paper['DOI']})\n\n")
            parts.append(f"**Top Topics**: {paper['Top Topics']}\n\n")
            parts.append(f"**Top Keywords**: {paper['Top Keywords']}\n\n")
            parts.append(f"**Score**: {paper['Score']}\n\n")
            parts.append("---\n\n")

        return "".join(parts)

    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, so any failure
        # is surfaced to the user instead of crashing the Gradio app.
        return f"An error occurred: {str(e)}"
92
# Define the Gradio app: a single-column Blocks layout with a question box,
# a Markdown results pane, and a submit button wired to analyze_keywords.
with gr.Blocks(css="""
#app-logo {
    width: 100px; /* Adjust the width */
    height: 100px; /* Maintain aspect ratio */
}
.left-container {
    display: flex;
    align-items: center; /* Align items vertically */
    gap: 20px; /* Add spacing between elements */
}
""") as demo:
    gr.Markdown("# Abstract Analyzer πŸ“‹")

    # Logo image, sized by the #app-logo CSS rule above.
    gr.Image(
        "hi-paris.png",
        label="App Logo",
        elem_id="app-logo"  # Use CSS for styling
    )

    with gr.Row():
        question_input = gr.Textbox(label="Ask a question about the abstracts", placeholder="E.g., What papers discuss innovation strategy?")
        #threshold_input = gr.Slider(label="Similarity Threshold", minimum=0.1, maximum=1.0, value=0.2, step=0.1)
    with gr.Row():
        result_output = gr.Markdown(label="Results")  # Use Markdown for better rendering

    with gr.Row():
        submit_button = gr.Button(value="Submit")  # Add a submit button

    # Link the submit button to the function. Only the question is passed,
    # so analyze_keywords runs with its default threshold (the slider wiring
    # below is commented out).
    submit_button.click(analyze_keywords, inputs=[question_input], outputs=result_output)

    #question_input.submit(analyze_keywords, inputs=[question_input, threshold_input], outputs=result_output)

    gr.Markdown("**Results provided by a Large Language Model πŸš€**")

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
hi-paris.png ADDED
processed_dataset_v6.csv ADDED
The diff for this file is too large to render. See raw diff