Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- app_gradio.py +132 -0
- hi-paris.png +0 -0
- processed_dataset_v6.csv +0 -0
app_gradio.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from nltk.corpus import stopwords
import nltk

# Ensure NLTK stopwords are available.
# NOTE(review): this runs on every app startup; it is a no-op once the corpus
# is cached locally but still emits console output — consider quiet=True.
nltk.download('stopwords')
# English stop-word set used by preprocess_text below.
stop_words = set(stopwords.words('english'))

# Additional words to remove: question/filler vocabulary specific to this app.
# NOTE(review): many entries duplicate NLTK's English stop words; harmless
# (set membership), but the list could be trimmed.
irrelevant_words = {"what", "paper", "abstract", "papers", "discuss", "find", "about","who","one","two",'is','are','the','this','that','which','how','what','where','when','why','who','whom','whose','which','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now'}


# Load the dataset of processed papers at import time.
# analyze_keywords expects columns: Title, doi, top_topics, top_keywords.
file_path = "processed_dataset_v6.csv"  # Path to uploaded file
df = pd.read_csv(file_path)
|
20 |
+
|
21 |
+
def preprocess_text(text):
    """Normalize a user query for keyword matching.

    Lowercases the text, strips all punctuation, and removes both NLTK
    English stop words and the app-specific ``irrelevant_words``.
    Returns the surviving tokens joined by single spaces.
    """
    # Lowercase and delete punctuation in one pass.
    cleaned = text.lower().translate(str.maketrans("", "", string.punctuation))

    # Keep only tokens that are neither stop words nor filler words.
    kept = []
    for token in cleaned.split():
        if token in stop_words or token in irrelevant_words:
            continue
        kept.append(token)

    return " ".join(kept)
|
34 |
+
|
35 |
+
def format_doi_url(doi):
    """Return the AEA web article URL for the given DOI."""
    base = "https://www.aeaweb.org/articles?id="
    return base + format(doi)
|
38 |
+
|
39 |
+
def analyze_keywords(question, threshold=0.2):
    """Rank papers in the module-level ``df`` by TF-IDF similarity to a query.

    Parameters
    ----------
    question : str
        Free-text user question; preprocessed before matching.
    threshold : float, optional
        Minimum raw cosine similarity for a paper to be included
        (default 0.2).

    Returns
    -------
    str
        Markdown-formatted list of matching papers (title, DOI link, topics,
        keywords, score), or a human-readable error message.
    """
    # Guard: the CSV must provide the columns rendered below.
    if not all(col in df.columns for col in ["Title", "doi", "top_topics", "top_keywords"]):
        return "The dataset must have 'Title', 'doi', 'top_topics', and 'top_keywords' columns."

    try:
        # Normalize the question so it matches the keyword vocabulary.
        processed_question = preprocess_text(question)

        # Corpus = one document per paper's keywords, plus the query last.
        corpus = df["top_keywords"].fillna("").tolist()
        corpus.append(processed_question)

        # TF-IDF embeddings over keywords + query.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Cosine similarity of every paper row against the query (last row).
        question_vector = tfidf_matrix[-1]
        similarities = cosine_similarity(tfidf_matrix[:-1], question_vector).flatten()

        # Collect papers at or above the similarity threshold.
        relevant_papers = []
        for idx, score in enumerate(similarities):
            if score >= threshold:
                row = df.iloc[idx]  # single positional lookup instead of four
                relevant_papers.append({
                    "Title": row["Title"],
                    "DOI": format_doi_url(row["doi"]),  # Format DOI correctly
                    "Top Topics": row["top_topics"],
                    "Top Keywords": row["top_keywords"],
                    # NOTE(review): +0.5 inflates the displayed score above the
                    # raw cosine similarity (can exceed 1.0). Kept for
                    # backward-compatible display — confirm this is intended.
                    "Score": round(score + 0.5, 2)
                })

        # Sort papers by (inflated) score, highest first; monotonic in the
        # raw similarity, so ordering is unaffected by the +0.5 offset.
        relevant_papers.sort(key=lambda p: p["Score"], reverse=True)

        if not relevant_papers:
            # fixed: was a placeholder-less f-string
            return "No relevant papers found."

        # Render results as Markdown for the gr.Markdown output component.
        output = "### Relevant Papers\n\n"
        for paper in relevant_papers:
            output += f"**Title**: {paper['Title']}\n\n"
            output += f"**DOI**: [Link]({paper['DOI']})\n\n"
            output += f"**Top Topics**: {paper['Top Topics']}\n\n"
            output += f"**Top Keywords**: {paper['Top Keywords']}\n\n"
            output += f"**Score**: {paper['Score']}\n\n"
            output += "---\n\n"

        return output

    except Exception as e:
        # Surface the failure to the Gradio UI instead of crashing the app.
        return f"An error occurred: {str(e)}"
|
92 |
+
# Define the Gradio app: a single-column Blocks layout with a question box,
# a Markdown results pane, and a submit button wired to analyze_keywords.
with gr.Blocks(css="""
#app-logo {
    width: 100px; /* Adjust the width */
    height: 100px; /* Maintain aspect ratio */
}
.left-container {
    display: flex;
    align-items: center; /* Align items vertically */
    gap: 20px; /* Add spacing between elements */
}
""") as demo:
    gr.Markdown("# Abstract Analyzer π")

    # Static logo image; sized via the #app-logo CSS rule above.
    gr.Image(
        "hi-paris.png",
        label="App Logo",
        elem_id="app-logo"  # Use CSS for styling
    )

    with gr.Row():
        question_input = gr.Textbox(label="Ask a question about the abstracts", placeholder="E.g., What papers discuss innovation strategy?")
        # Disabled threshold control; analyze_keywords falls back to its 0.2 default.
        #threshold_input = gr.Slider(label="Similarity Threshold", minimum=0.1, maximum=1.0, value=0.2, step=0.1)
    with gr.Row():
        result_output = gr.Markdown(label="Results")  # Use Markdown for better rendering

    with gr.Row():
        submit_button = gr.Button(value="Submit")  # Add a submit button

    # Link the submit button to the function; threshold uses its default.
    submit_button.click(analyze_keywords, inputs=[question_input], outputs=result_output)

    # Disabled Enter-key submit path that paired with the slider above.
    #question_input.submit(analyze_keywords, inputs=[question_input, threshold_input], outputs=result_output)

    # NOTE(review): results come from TF-IDF cosine similarity (see
    # analyze_keywords), not an LLM — the banner text below looks inaccurate;
    # confirm with the app owner before changing the user-facing string.
    gr.Markdown("**Results provided by a Large Language Model π**")

# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
hi-paris.png
ADDED
![]() |
processed_dataset_v6.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|