gaetanbrison committed on
Commit
53dd3ee
Β·
verified Β·
1 Parent(s): b91b0f0

Upload 3 files

Browse files
Files changed (3) hide show
  1. app_gradio.py +132 -0
  2. hi-paris.png +0 -0
  3. processed_dataset_v6.csv +0 -0
app_gradio.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from nltk.corpus import stopwords
import nltk

# Ensure NLTK stopwords are available
# (runs at import time on every startup; NLTK caches the corpus locally
# after the first download)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Additional words to remove
# NOTE(review): many entries duplicate the NLTK stop-word list and each
# other ('what', 'who', 'which', ... appear twice); harmless in a set
# literal, but the list could be trimmed.
irrelevant_words = {"what", "paper", "abstract", "papers", "discuss", "find", "about","who","one","two",'is','are','the','this','that','which','how','what','where','when','why','who','whom','whose','which','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now'}


# Load the dataset
# (read eagerly at module import; the app fails fast if the CSV is missing)
file_path = "processed_dataset_v6.csv" # Path to uploaded file
df = pd.read_csv(file_path)
21
def preprocess_text(text):
    """Normalize a free-text question for keyword matching.

    Lowercases the input, strips all ASCII punctuation, then drops every
    token found in the NLTK ``stop_words`` set or in the module-level
    ``irrelevant_words`` set.

    Parameters
    ----------
    text : str
        Raw user input.

    Returns
    -------
    str
        Space-joined sequence of the surviving tokens.
    """
    # Lowercase and remove punctuation in a single translation pass.
    cleaned = text.lower().translate(str.maketrans("", "", string.punctuation))

    kept = []
    for token in cleaned.split():
        # Skip both generic English stop words and app-specific noise words.
        if token in stop_words or token in irrelevant_words:
            continue
        kept.append(token)

    return " ".join(kept)
34
+
35
def format_doi_url(doi):
    """Return the AEA article page URL for the given DOI.

    Parameters
    ----------
    doi : str
        DOI identifier as stored in the dataset's ``doi`` column.

    Returns
    -------
    str
        Full ``aeaweb.org`` article link for that DOI.
    """
    base = "https://www.aeaweb.org/articles?id="
    return base + format(doi)
38
+
39
def analyze_keywords(question, threshold=0.2):
    """Rank papers in the module-level ``df`` by TF-IDF similarity to *question*.

    Parameters
    ----------
    question : str
        Free-text user question; normalized via ``preprocess_text`` before
        matching against each paper's ``top_keywords``.
    threshold : float, optional
        Minimum cosine similarity for a paper to be reported (default 0.2).

    Returns
    -------
    str
        Markdown listing of the matching papers (title, DOI link, topics,
        keywords, score), or an explanatory message / error string.
    """
    # Check if the required columns exist
    required = ("Title", "doi", "top_topics", "top_keywords")
    if not all(col in df.columns for col in required):
        return "The dataset must have 'Title', 'doi', 'top_topics', and 'top_keywords' columns."

    try:
        # Preprocess the question
        processed_question = preprocess_text(question)

        # Vectorize the question together with the keyword corpus so both
        # share one TF-IDF vocabulary; the question is the last row.
        corpus = df["top_keywords"].fillna("").tolist()
        corpus.append(processed_question)

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Similarity between the question (last row) and every paper.
        question_vector = tfidf_matrix[-1]
        similarities = cosine_similarity(tfidf_matrix[:-1], question_vector).flatten()

        # Collect papers above the similarity threshold.
        relevant_papers = []
        for idx, score in enumerate(similarities):
            if score < threshold:
                continue
            row = df.iloc[idx]  # hoisted: one positional lookup per paper instead of four
            relevant_papers.append({
                "Title": row["Title"],
                "DOI": format_doi_url(row["doi"]),  # Format DOI correctly
                "Top Topics": row["top_topics"],
                "Top Keywords": row["top_keywords"],
                # NOTE(review): the +0.5 offset inflates the displayed cosine
                # similarity (can exceed 1.0) — presumably a cosmetic choice;
                # confirm intent before removing.
                "Score": round(score + 0.5, 2),
            })

        # Sort papers by similarity score (descending order)
        relevant_papers.sort(key=lambda paper: paper["Score"], reverse=True)

        if not relevant_papers:
            return "No relevant papers found."

        # Build the Markdown output with a join instead of repeated +=.
        parts = ["### Relevant Papers\n\n"]
        for paper in relevant_papers:
            parts.append(f"**Title**: {paper['Title']}\n\n")
            parts.append(f"**DOI**: [Link]({paper['DOI']})\n\n")
            parts.append(f"**Top Topics**: {paper['Top Topics']}\n\n")
            parts.append(f"**Top Keywords**: {paper['Top Keywords']}\n\n")
            parts.append(f"**Score**: {paper['Score']}\n\n")
            parts.append("---\n\n")

        return "".join(parts)

    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, so any failure
        # is surfaced to the user instead of crashing the Gradio app.
        return f"An error occurred: {str(e)}"
92
# Define the Gradio app: a single-column Blocks layout with a question box,
# a Markdown results pane, and a submit button wired to analyze_keywords.
with gr.Blocks(css="""
#app-logo {
    width: 100px; /* Adjust the width */
    height: 100px; /* Maintain aspect ratio */
}
.left-container {
    display: flex;
    align-items: center; /* Align items vertically */
    gap: 20px; /* Add spacing between elements */
}
""") as demo:
    gr.Markdown("# Abstract Analyzer πŸ“‹")

    # Logo image, sized by the #app-logo CSS rule above.
    gr.Image(
        "hi-paris.png",
        label="App Logo",
        elem_id="app-logo"  # Use CSS for styling
    )

    with gr.Row():
        question_input = gr.Textbox(label="Ask a question about the abstracts", placeholder="E.g., What papers discuss innovation strategy?")
        #threshold_input = gr.Slider(label="Similarity Threshold", minimum=0.1, maximum=1.0, value=0.2, step=0.1)
    with gr.Row():
        result_output = gr.Markdown(label="Results")  # Use Markdown for better rendering

    with gr.Row():
        submit_button = gr.Button(value="Submit")  # Add a submit button

    # Link the submit button to the function. Only the question is passed,
    # so analyze_keywords runs with its default threshold (the slider wiring
    # below is commented out).
    submit_button.click(analyze_keywords, inputs=[question_input], outputs=result_output)

    #question_input.submit(analyze_keywords, inputs=[question_input, threshold_input], outputs=result_output)

    gr.Markdown("**Results provided by a Large Language Model πŸš€**")

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
hi-paris.png ADDED
processed_dataset_v6.csv ADDED
The diff for this file is too large to render. See raw diff