CCockrum committed on
Commit
abad8a2
·
verified ·
1 Parent(s): 034aafa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +405 -0
app.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import json
4
+ import pandas as pd
5
+ from datetime import datetime, timedelta
6
+ import re
7
+ from typing import List, Dict, Tuple
8
+ import xml.etree.ElementTree as ET
9
+ from collections import Counter
10
+ import plotly.express as px
11
+ import plotly.graph_objects as go
12
+ from transformers import pipeline
13
+ import numpy as np
14
+
15
class CancerResearchLiteratureMiner:
    """Search PubMed for cancer research in animal models and mine the results.

    The miner queries the NCBI E-utilities (``esearch`` then ``efetch``),
    parses the returned PubMed XML into plain dictionaries, and derives simple
    statistics from titles and abstracts: publication years, animal-model
    keywords, drug-like names and (when the transformers model loaded) a
    zero-shot research category per paper.
    """

    # Seconds to wait for each E-utilities HTTP request before giving up, so a
    # stalled connection cannot hang the UI callback forever.
    REQUEST_TIMEOUT = 30

    # Heuristic for drug-like names: a capitalised word ending in a common
    # pharmaceutical suffix. It is deliberately simple and will produce some
    # false positives (e.g. ordinary words ending in "ine").
    _DRUG_PATTERN = re.compile(r'\b[A-Z][a-z]*(?:mab|nib|ine|ole|cin|tin)\b')

    def __init__(self):
        # Initialize NLP pipelines. Loading is best-effort: if the models (or
        # the transformers dependency) are unavailable the miner still works,
        # it just skips summarisation/classification.
        try:
            self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
            self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        except Exception as e:
            print(f"Warning: Could not load transformers models: {e}")
            self.summarizer = None
            self.classifier = None

        # Research categories for classification
        self.research_categories = [
            "drug discovery", "immunotherapy", "chemotherapy", "radiation therapy",
            "biomarkers", "diagnostics", "metastasis", "tumor microenvironment",
            "animal models", "preclinical studies", "toxicity", "pharmacokinetics"
        ]

        # Animal model keywords
        self.animal_keywords = [
            "mouse", "mice", "rat", "rats", "xenograft", "orthotopic", "transgenic",
            "knockout", "immunodeficient", "nude mice", "SCID", "NOD", "PDX",
            "patient-derived xenograft", "syngeneic", "canine", "dog", "feline", "cat"
        ]

    @staticmethod
    def _element_text(elem, default: str = "N/A") -> str:
        """Return all text inside *elem*, including text nested in child tags.

        ``Element.text`` stops at the first child element, so a title such as
        ``Role of <i>TP53</i> in ...`` would be truncated (or ``None``).
        ``itertext()`` collects the whole content; whitespace is normalised.
        *default* is returned when the element is missing or empty.
        """
        if elem is None:
            return default
        text = " ".join("".join(elem.itertext()).split())
        return text or default

    def search_pubmed(self, query: str, max_results: int = 50) -> List[Dict]:
        """Search PubMed for cancer research papers.

        Args:
            query: Free-text search terms; animal-model terms are AND-ed on.
            max_results: Maximum number of records to request. Coerced to
                ``int`` because UI sliders may deliver a float.

        Returns:
            A list of paper dicts (see ``_parse_pubmed_xml``), an empty list
            when nothing matched, or ``[{"error": message}]`` on failure.
        """
        # Enhance query with animal model terms
        enhanced_query = f"({query}) AND (animal model OR mouse OR mice OR rat OR xenograft OR preclinical)"

        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

        try:
            search_params = {
                "db": "pubmed",
                "term": enhanced_query,
                "retmax": int(max_results),
                "retmode": "json",
                "sort": "relevance"
            }
            search_response = requests.get(
                search_url, params=search_params, timeout=self.REQUEST_TIMEOUT
            )
            # Surface HTTP errors as such instead of as a JSON decode failure.
            search_response.raise_for_status()
            search_data = search_response.json()

            ids = search_data.get("esearchresult", {}).get("idlist", [])
            if not ids:
                return []

            # Get detailed information
            fetch_params = {
                "db": "pubmed",
                "id": ",".join(ids),
                "retmode": "xml"
            }
            fetch_response = requests.get(
                fetch_url, params=fetch_params, timeout=self.REQUEST_TIMEOUT
            )
            fetch_response.raise_for_status()

            # Parse XML response
            return self._parse_pubmed_xml(fetch_response.text)

        except Exception as e:
            # Boundary handler: callers expect an error record, not a raise.
            return [{"error": f"Search failed: {str(e)}"}]

    def _parse_pubmed_xml(self, xml_content: str) -> List[Dict]:
        """Parse a PubMed ``efetch`` XML response.

        Returns:
            One dict per ``PubmedArticle`` with the keys ``pmid``, ``title``,
            ``abstract``, ``authors``, ``journal`` and ``year``. Every value is
            a string; missing fields are ``"N/A"`` (``authors`` may be empty).
            If the XML cannot be parsed, ``[{"error": message}]`` is returned.
        """
        try:
            root = ET.fromstring(xml_content)
        except (ET.ParseError, TypeError, ValueError) as e:
            return [{"error": f"XML parsing failed: {str(e)}"}]

        papers = []
        for article in root.findall(".//PubmedArticle"):
            paper = {}

            # Extract basic info
            medline = article.find(".//MedlineCitation")
            pmid = medline.find(".//PMID") if medline is not None else None
            paper["pmid"] = self._element_text(pmid)

            # Extract title (may contain inline markup such as <i> or <sub>)
            paper["title"] = self._element_text(article.find(".//ArticleTitle"))

            # Extract abstract: structured abstracts are split into several
            # AbstractText sections, so join all of them, not just the first.
            sections = [
                self._element_text(section, "")
                for section in article.findall(".//Abstract/AbstractText")
            ]
            paper["abstract"] = " ".join(s for s in sections if s) or "N/A"

            # Extract authors (first three, with an ellipsis if there are more)
            authors = []
            for author in article.findall(".//Author"):
                fore = self._element_text(author.find(".//ForeName"), "")
                last = self._element_text(author.find(".//LastName"), "")
                if fore and last:
                    authors.append(f"{fore} {last}")
            paper["authors"] = ", ".join(authors[:3]) + ("..." if len(authors) > 3 else "")

            # Extract journal and date
            paper["journal"] = self._element_text(article.find(".//Journal/Title"))

            year = self._element_text(article.find(".//PubDate/Year"), "")
            if not year:
                # Some records carry a free-form MedlineDate ("2021 Jan-Feb")
                # instead of a Year element; take its first 4-digit run.
                medline_date = self._element_text(
                    article.find(".//PubDate/MedlineDate"), ""
                )
                match = re.search(r"\d{4}", medline_date)
                year = match.group(0) if match else "N/A"
            paper["year"] = year

            papers.append(paper)

        return papers

    def analyze_papers(self, papers: List[Dict]) -> Dict:
        """Analyze the retrieved papers for insights.

        Args:
            papers: Paper dicts as produced by ``_parse_pubmed_xml``.

        Returns:
            A dict with ``total_papers``, ``year_distribution``,
            ``animal_models``, ``research_categories``, ``drug_mentions`` (the
            ten most frequent, name -> count) plus the currently unpopulated
            ``key_findings`` and ``methodology_trends``; or
            ``{"error": ...}`` when there is nothing to analyze.
        """
        if not papers or papers[0].get("error"):
            return {"error": "No papers to analyze"}

        analysis = {
            "total_papers": len(papers),
            "year_distribution": {},
            "animal_models": {},
            "research_categories": {},
            "key_findings": [],
            "drug_mentions": [],
            "methodology_trends": {}
        }

        # Match keywords as whole words (optionally plural) against lowercased
        # text. Plain substring tests counted "rat" inside "rate" and "cat"
        # inside "indicate", and uppercase keywords such as "PDX" could never
        # match text that had already been lowercased.
        keyword_patterns = [
            (keyword, re.compile(rf"\b{re.escape(keyword.lower())}s?\b"))
            for keyword in self.animal_keywords
        ]

        # Analyze each paper
        for paper in papers:
            # Year distribution
            year = paper.get("year") or "Unknown"
            analysis["year_distribution"][year] = analysis["year_distribution"].get(year, 0) + 1

            # `or ""` guards against fields that are present but None.
            raw_abstract = paper.get("abstract") or ""
            abstract = raw_abstract.lower()
            title = (paper.get("title") or "").lower()
            full_text = f"{title} {abstract}"

            # Animal model detection
            for keyword, pattern in keyword_patterns:
                if pattern.search(full_text):
                    analysis["animal_models"][keyword] = analysis["animal_models"].get(keyword, 0) + 1

            # Extract drug mentions (case-sensitive, so use the raw abstract)
            analysis["drug_mentions"].extend(self._DRUG_PATTERN.findall(raw_abstract))

            # Classify research category if classifier is available
            if self.classifier and abstract and abstract != "n/a":
                try:
                    result = self.classifier(abstract[:512], self.research_categories)
                    top_category = result["labels"][0]
                    analysis["research_categories"][top_category] = analysis["research_categories"].get(top_category, 0) + 1
                except Exception as e:
                    # Classification is best-effort; report and keep going.
                    print(f"Warning: Could not classify paper {paper.get('pmid', 'N/A')}: {e}")

        # Process drug mentions
        drug_counter = Counter(analysis["drug_mentions"])
        analysis["drug_mentions"] = dict(drug_counter.most_common(10))

        return analysis

    def generate_summary(self, papers: List[Dict], analysis: Dict) -> str:
        """Generate a Markdown summary of the findings.

        Args:
            papers: Paper dicts as produced by ``_parse_pubmed_xml``.
            analysis: Result of ``analyze_papers`` for the same papers.

        Returns:
            Markdown text, or a short message when there are no papers or
            either input is an error record.
        """
        if not papers or papers[0].get("error") or not analysis or analysis.get("error"):
            return "No papers found or error in retrieval."

        lines = [
            "",
            "# Literature Mining Summary",
            "",
            "## Overview",
            f"- **Total Papers Found**: {analysis.get('total_papers', len(papers))}",
            f"- **Search Date**: {datetime.now().strftime('%Y-%m-%d')}",
            "",
            "## Key Insights",
            "",
            "### Animal Models Used",
        ]

        # Top animal models
        animal_models = analysis.get("animal_models") or {}
        top_models = sorted(animal_models.items(), key=lambda x: x[1], reverse=True)[:5]
        for model, count in top_models:
            # Keep acronyms (PDX, SCID, NOD) upper-case instead of "Pdx".
            label = model if model.isupper() else model.title()
            lines.append(f"- **{label}**: {count} papers")
        if not top_models:
            lines.append("- _None detected_")

        lines += ["", "### Research Focus Areas"]

        # Research categories
        research_categories = analysis.get("research_categories") or {}
        top_categories = sorted(research_categories.items(), key=lambda x: x[1], reverse=True)[:5]
        for category, count in top_categories:
            lines.append(f"- **{category.title()}**: {count} papers")
        if not top_categories:
            lines.append("- _None detected_")

        lines += ["", "### Frequently Mentioned Drugs"]

        # Drug mentions (already ordered by frequency in analyze_papers)
        drug_mentions = analysis.get("drug_mentions") or {}
        top_drugs = list(drug_mentions.items())[:5]
        for drug, count in top_drugs:
            lines.append(f"- **{drug}**: {count} mentions")
        if not top_drugs:
            lines.append("- _None detected_")

        lines += ["", "### Recent Highlights"]

        # Recent papers (last 2 years)
        current_year = datetime.now().year
        recent_papers = [
            p for p in papers
            if str(p.get("year") or "").isdigit() and int(p["year"]) >= current_year - 2
        ]

        for paper in recent_papers[:3]:
            lines.append(f"- **{paper.get('title', 'N/A')}** ({paper.get('year', 'N/A')})")
            lines.append(f"  *{paper.get('journal', 'N/A')}*")
            lines.append("")

        return "\n".join(lines) + "\n"

    def create_visualizations(self, analysis: Dict):
        """Create visualization plots.

        Args:
            analysis: Result of ``analyze_papers``.

        Returns:
            A dict that may contain the Plotly figures ``year_dist``,
            ``animal_models`` and ``categories``; a figure is omitted when its
            underlying data is empty or missing.
        """
        plots = {}

        # Year distribution, in sorted order of the year labels
        year_distribution = analysis.get("year_distribution") or {}
        if year_distribution:
            years = sorted(year_distribution, key=str)
            counts = [year_distribution[y] for y in years]

            fig_year = px.bar(
                x=years, y=counts,
                title="Publication Year Distribution",
                labels={"x": "Year", "y": "Number of Papers"}
            )
            plots["year_dist"] = fig_year

        # Animal models: the ten most frequent, not the first ten encountered
        animal_models = analysis.get("animal_models") or {}
        if animal_models:
            ranked = sorted(animal_models.items(), key=lambda x: x[1], reverse=True)[:10]
            models = [name for name, _ in ranked]
            model_counts = [count for _, count in ranked]

            fig_models = px.bar(
                x=model_counts, y=models,
                orientation='h',
                title="Most Common Animal Models",
                labels={"x": "Number of Papers", "y": "Animal Model"}
            )
            plots["animal_models"] = fig_models

        # Research categories
        research_categories = analysis.get("research_categories") or {}
        if research_categories:
            categories = list(research_categories.keys())
            cat_counts = list(research_categories.values())

            fig_categories = px.pie(
                values=cat_counts, names=categories,
                title="Research Focus Distribution"
            )
            plots["categories"] = fig_categories

        return plots
267
+
268
def create_gradio_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app with a query box, a result-count slider and
    three result tabs (summary, papers table, plots), wired to a single
    ``CancerResearchLiteratureMiner`` instance created here.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    miner = CancerResearchLiteratureMiner()

    def shorten(text, limit=100):
        """Return *text* cut to *limit* characters plus "..." when longer."""
        text = text or "N/A"
        return text[:limit] + "..." if len(text) > limit else text

    def search_and_analyze(query, max_results):
        """Main function to search and analyze literature.

        Returns a 5-tuple matching the click handler's outputs: summary
        Markdown, papers DataFrame, and the year / animal-model / category
        figures. On any failure the first item is a message and the rest
        are ``None``.
        """
        if not query or not query.strip():
            return "Please enter a search query.", None, None, None, None

        # Search papers. The slider may deliver a float; the API wants an int.
        papers = miner.search_pubmed(query, int(max_results))

        if not papers or papers[0].get("error"):
            error_msg = papers[0].get("error", "No papers found") if papers else "No papers found"
            return f"Error: {error_msg}", None, None, None, None

        # Analyze papers
        analysis = miner.analyze_papers(papers)
        if analysis.get("error"):
            return f"Error: {analysis['error']}", None, None, None, None

        # Generate summary
        summary = miner.generate_summary(papers, analysis)

        # Create visualizations
        plots = miner.create_visualizations(analysis)

        # Create papers dataframe
        papers_df = pd.DataFrame([
            {
                "PMID": p.get("pmid", "N/A"),
                "Title": shorten(p.get("title")),
                "Authors": p.get("authors", "N/A"),
                "Journal": p.get("journal", "N/A"),
                "Year": p.get("year", "N/A")
            }
            for p in papers
        ])

        return (
            summary,
            papers_df,
            plots.get("year_dist"),
            plots.get("animal_models"),
            plots.get("categories")
        )

    # Create interface
    with gr.Blocks(title="Cancer Research Literature Mining Agent", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🔬 Cancer Research Literature Mining Agent

        This AI agent searches and analyzes scientific literature related to cancer research in animal models.
        It automatically extracts insights about animal models used, research focus areas, and emerging trends.

        **Features:**
        - PubMed literature search with animal model focus
        - Automatic categorization of research areas
        - Drug mention extraction
        - Publication trend analysis
        - Interactive visualizations
        """)

        with gr.Row():
            with gr.Column(scale=2):
                query_input = gr.Textbox(
                    label="Research Query",
                    placeholder="e.g., 'breast cancer immunotherapy', 'lung cancer biomarkers', 'pancreatic cancer treatment'",
                    lines=2
                )
                max_results = gr.Slider(
                    minimum=10, maximum=100, value=50, step=10,
                    label="Maximum Results"
                )
                search_btn = gr.Button("🔍 Search & Analyze Literature", variant="primary")

            with gr.Column(scale=1):
                gr.Markdown("""
                ### Tips for Better Results:
                - Use specific cancer types (e.g., "breast cancer", "melanoma")
                - Include treatment modalities (e.g., "immunotherapy", "chemotherapy")
                - Add animal model terms (e.g., "mouse model", "xenograft")
                """)

        with gr.Tabs():
            with gr.TabItem("📊 Summary & Insights"):
                summary_output = gr.Markdown(label="Analysis Summary")

            with gr.TabItem("📋 Papers Found"):
                papers_output = gr.Dataframe(
                    headers=["PMID", "Title", "Authors", "Journal", "Year"],
                    label="Retrieved Papers"
                )

            with gr.TabItem("📈 Visualizations"):
                with gr.Row():
                    year_plot = gr.Plot(label="Publication Timeline")
                    models_plot = gr.Plot(label="Animal Models")
                with gr.Row():
                    categories_plot = gr.Plot(label="Research Categories")

        # Connect the search function
        search_btn.click(
            search_and_analyze,
            inputs=[query_input, max_results],
            outputs=[summary_output, papers_output, year_plot, models_plot, categories_plot]
        )

        # Add examples
        gr.Examples(
            examples=[
                ["breast cancer immunotherapy mouse model", 50],
                ["lung cancer biomarkers xenograft", 30],
                ["pancreatic cancer treatment PDX", 40],
                ["melanoma drug resistance animal model", 35]
            ],
            inputs=[query_input, max_results]
        )

        gr.Markdown("""
        ### About This Agent
        This literature mining agent is specifically designed for cancer research in animal models.
        It searches PubMed for relevant papers and provides automated analysis of research trends,
        commonly used animal models, and emerging therapeutic approaches.

        **Data Sources:** PubMed/NCBI databases
        **Last Updated:** June 2025
        **Supported Research Areas:** All cancer types and animal models
        """)

    return interface
397
+
398
# Script entry point: build the UI and start serving it.
if __name__ == "__main__":
    # Keyword arguments forwarded unchanged to Blocks.launch().
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    interface = create_gradio_interface()
    interface.launch(**launch_options)