import json
import logging
import re
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime, timedelta
from typing import List, Dict, Tuple

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from transformers import pipeline


class CancerResearchLiteratureMiner:
    """Search PubMed for animal-model cancer research and summarise the hits.

    Typical flow: ``search_pubmed`` -> ``analyze_papers`` ->
    ``generate_summary`` / ``create_visualizations``.

    Papers are plain dicts with the keys ``pmid``, ``title``, ``abstract``,
    ``authors``, ``journal`` and ``year`` (missing values are ``"N/A"``).
    A failed search or parse is reported as a one-element list holding an
    ``{"error": message}`` dict.
    """

    # Seconds to wait on each E-utilities HTTP call before giving up.
    REQUEST_TIMEOUT = 30

    # Heuristic: a capitalised word that ends in a common drug-name suffix.
    _DRUG_PATTERN = re.compile(r'\b[A-Z][a-z]*(?:mab|nib|ine|ole|cin|tin)\b')

    # First four-digit run inside a free-form date such as "2021 Jan-Feb".
    _YEAR_PATTERN = re.compile(r"\d{4}")

    _log = logging.getLogger(__name__)

    def __init__(self):
        """Load the optional transformer pipelines and the keyword tables.

        If the pipelines cannot be created, ``summarizer`` and ``classifier``
        are set to ``None`` and the category analysis is skipped later on.
        """
        try:
            self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
            self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        except Exception as e:
            # The models are optional, so a load failure only disables features.
            self._log.warning("Warning: Could not load transformers models: %s", e)
            self.summarizer = None
            self.classifier = None

        # Candidate labels handed to the zero-shot classifier.
        self.research_categories = [
            "drug discovery", "immunotherapy", "chemotherapy", "radiation therapy",
            "biomarkers", "diagnostics", "metastasis", "tumor microenvironment",
            "animal models", "preclinical studies", "toxicity", "pharmacokinetics",
        ]

        # Terms counted (case-insensitively, as whole words) in title + abstract.
        self.animal_keywords = [
            "mouse", "mice", "rat", "rats", "xenograft", "orthotopic", "transgenic",
            "knockout", "immunodeficient", "nude mice", "SCID", "NOD", "PDX",
            "patient-derived xenograft", "syngeneic", "canine", "dog", "feline", "cat",
        ]

    def search_pubmed(self, query: str, max_results: int = 50) -> List[Dict]:
        """Search PubMed for cancer research papers.

        The query is AND-ed with a fixed set of animal-model terms, the
        matching IDs are fetched with ESearch and their records with EFetch.

        Args:
            query: User search terms.
            max_results: Upper bound on the number of IDs requested.

        Returns:
            A list of paper dicts, ``[]`` when nothing matched, or
            ``[{"error": message}]`` when any step failed.
        """
        enhanced_query = f"({query}) AND (animal model OR mouse OR mice OR rat OR xenograft OR preclinical)"

        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

        # This method is the boundary to the UI: every failure (network, HTTP
        # status, malformed JSON, bad argument) becomes an error record.
        try:
            search_params = {
                "db": "pubmed",
                "term": enhanced_query,
                "retmax": int(max_results),
                "retmode": "json",
                "sort": "relevance",
            }
            search_response = requests.get(
                search_url, params=search_params, timeout=self.REQUEST_TIMEOUT
            )
            search_response.raise_for_status()
            search_data = search_response.json()

            ids = search_data.get("esearchresult", {}).get("idlist", [])
            if not ids:
                return []

            fetch_params = {
                "db": "pubmed",
                "id": ",".join(ids),
                "retmode": "xml",
            }
            fetch_response = requests.get(
                fetch_url, params=fetch_params, timeout=self.REQUEST_TIMEOUT
            )
            fetch_response.raise_for_status()

            return self._parse_pubmed_xml(fetch_response.text)
        except Exception as e:
            return [{"error": f"Search failed: {str(e)}"}]

    @staticmethod
    def _element_text(element, default: str = "N/A") -> str:
        """Return all text inside *element*, including text of nested tags.

        ``element.text`` alone stops at the first child tag, so a title such
        as ``<i>TP53</i> loss`` would come back as ``None``.
        """
        if element is None:
            return default
        text = "".join(element.itertext()).strip()
        return text or default

    def _publication_year(self, article) -> str:
        """Return the publication year of *article*, or ``"N/A"``.

        Uses ``PubDate/Year`` when present and otherwise the first four-digit
        number found in ``PubDate/MedlineDate``.
        """
        year = article.find(".//PubDate/Year")
        if year is not None and year.text:
            return year.text
        medline_date = article.find(".//PubDate/MedlineDate")
        if medline_date is not None and medline_date.text:
            match = self._YEAR_PATTERN.search(medline_date.text)
            if match:
                return match.group()
        return "N/A"

    def _parse_pubmed_xml(self, xml_content: str) -> List[Dict]:
        """Parse PubMed XML response.

        Args:
            xml_content: XML text containing ``PubmedArticle`` elements.

        Returns:
            One dict per ``PubmedArticle``, or ``[{"error": message}]`` when
            the text is not well-formed XML.
        """
        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError as e:
            return [{"error": f"XML parsing failed: {str(e)}"}]

        papers = []
        for article in root.findall(".//PubmedArticle"):
            medline = article.find(".//MedlineCitation")
            pmid = medline.find(".//PMID") if medline is not None else None

            # Structured abstracts have several AbstractText sections; keep all.
            sections = [
                self._element_text(section, default="")
                for section in article.findall(".//Abstract/AbstractText")
            ]
            abstract = " ".join(section for section in sections if section)

            authors = []
            for author in article.findall(".//Author"):
                fname = author.find(".//ForeName")
                lname = author.find(".//LastName")
                if fname is not None and lname is not None and fname.text and lname.text:
                    authors.append(f"{fname.text} {lname.text}")

            papers.append({
                "pmid": self._element_text(pmid),
                "title": self._element_text(article.find(".//ArticleTitle")),
                "abstract": abstract or "N/A",
                # Show at most three names; "..." marks a longer author list.
                "authors": ", ".join(authors[:3]) + ("..." if len(authors) > 3 else ""),
                "journal": self._element_text(article.find(".//Journal/Title")),
                "year": self._publication_year(article),
            })

        return papers

    def analyze_papers(self, papers: List[Dict]) -> Dict:
        """Analyze the retrieved papers for insights.

        Args:
            papers: Paper dicts as produced by ``search_pubmed``.

        Returns:
            ``{"error": ...}`` when there is nothing to analyse, otherwise a
            dict with ``total_papers``, ``year_distribution``,
            ``animal_models`` (papers per keyword), ``research_categories``
            (papers per top classifier label), ``drug_mentions`` (the ten most
            frequent matches with their counts), plus the unused placeholders
            ``key_findings`` and ``methodology_trends``.
        """
        if not papers or papers[0].get("error"):
            return {"error": "No papers to analyze"}

        analysis = {
            "total_papers": len(papers),
            "year_distribution": {},
            "animal_models": {},
            "research_categories": {},
            "key_findings": [],
            "drug_mentions": [],
            "methodology_trends": {},
        }

        # The text is lowercased below, so the keywords must be too; word
        # boundaries stop "rat" from matching "rate" while the optional "s"
        # still accepts simple plurals such as "xenografts".
        keyword_patterns = [
            (keyword, re.compile(rf"\b{re.escape(keyword.lower())}s?\b"))
            for keyword in self.animal_keywords
        ]
        drug_counter = Counter()

        for paper in papers:
            year = paper.get("year") or "Unknown"
            analysis["year_distribution"][year] = analysis["year_distribution"].get(year, 0) + 1

            raw_abstract = paper.get("abstract") or ""
            abstract = raw_abstract.lower()
            title = (paper.get("title") or "").lower()
            full_text = f"{title} {abstract}"

            # Each keyword counts at most once per paper.
            for keyword, pattern in keyword_patterns:
                if pattern.search(full_text):
                    analysis["animal_models"][keyword] = analysis["animal_models"].get(keyword, 0) + 1

            # Case matters for the drug heuristic, so scan the original text.
            drug_counter.update(self._DRUG_PATTERN.findall(raw_abstract))

            if self.classifier and abstract and abstract != "n/a":
                try:
                    result = self.classifier(abstract[:512], self.research_categories)
                    top_category = result["labels"][0]
                except Exception as e:
                    # Best effort: one failing abstract must not abort the run.
                    self._log.warning(
                        "Could not classify paper %s: %s", paper.get("pmid", "N/A"), e
                    )
                else:
                    analysis["research_categories"][top_category] = (
                        analysis["research_categories"].get(top_category, 0) + 1
                    )

        analysis["drug_mentions"] = dict(drug_counter.most_common(10))
        return analysis

    def generate_summary(self, papers: List[Dict], analysis: Dict) -> str:
        """Generate a comprehensive summary of findings.

        Args:
            papers: Paper dicts as produced by ``search_pubmed``.
            analysis: Result of ``analyze_papers`` for the same papers.

        Returns:
            Markdown text listing the top five animal models, research
            categories and drug mentions, followed by up to three papers
            published in the current year or the two before it.
        """
        if not papers or papers[0].get("error") or analysis.get("error"):
            return "No papers found or error in retrieval."

        def by_count(counts):
            return sorted(counts.items(), key=lambda item: item[1], reverse=True)[:5]

        parts = [
            "\n# Literature Mining Summary\n"
            "\n## Overview\n"
            f"- **Total Papers Found**: {analysis.get('total_papers', len(papers))}\n"
            f"- **Search Date**: {datetime.now().strftime('%Y-%m-%d')}\n"
            "\n## Key Insights\n"
            "\n### Animal Models Used\n"
        ]

        for model, count in by_count(analysis.get("animal_models") or {}):
            # Keep acronyms such as "PDX" intact instead of turning them into "Pdx".
            label = model if model.isupper() else model.title()
            parts.append(f"- **{label}**: {count} papers\n")

        parts.append("\n### Research Focus Areas\n")
        for category, count in by_count(analysis.get("research_categories") or {}):
            parts.append(f"- **{category.title()}**: {count} papers\n")

        parts.append("\n### Frequently Mentioned Drugs\n")
        for drug, count in list((analysis.get("drug_mentions") or {}).items())[:5]:
            parts.append(f"- **{drug}**: {count} mentions\n")

        parts.append("\n### Recent Highlights\n")
        current_year = datetime.now().year
        recent_papers = [
            p for p in papers
            if str(p.get("year") or "").isdigit() and int(p["year"]) >= current_year - 2
        ]
        for paper in recent_papers[:3]:
            parts.append(f"- **{paper.get('title', 'N/A')}** ({paper.get('year', 'N/A')})\n")
            parts.append(f" *{paper.get('journal', 'N/A')}*\n\n")

        return "".join(parts)

    def create_visualizations(self, analysis: Dict):
        """Create visualization plots.

        Args:
            analysis: Result of ``analyze_papers``.

        Returns:
            Dict of Plotly figures; ``year_dist``, ``animal_models`` and
            ``categories`` are present only when the matching analysis
            section is non-empty.
        """
        plots = {}

        year_distribution = analysis.get("year_distribution")
        if year_distribution:
            # Chronological order; non-numeric labels such as "N/A" sort last.
            years = sorted(year_distribution, key=str)
            counts = [year_distribution[year] for year in years]

            fig_year = px.bar(
                x=years, y=counts,
                title="Publication Year Distribution",
                labels={"x": "Year", "y": "Number of Papers"},
            )
            # Treat the year labels as categories so the sorted order is used.
            fig_year.update_xaxes(type="category")
            plots["year_dist"] = fig_year

        animal_models = analysis.get("animal_models")
        if animal_models:
            # The ten most frequent keywords, not simply the first ten seen.
            top_models = sorted(
                animal_models.items(), key=lambda item: item[1], reverse=True
            )[:10]
            models = [name for name, _ in top_models]
            model_counts = [count for _, count in top_models]

            fig_models = px.bar(
                x=model_counts, y=models,
                orientation='h',
                title="Most Common Animal Models",
                labels={"x": "Number of Papers", "y": "Animal Model"},
            )
            fig_models.update_yaxes(categoryorder="total ascending")
            plots["animal_models"] = fig_models

        research_categories = analysis.get("research_categories")
        if research_categories:
            categories = list(research_categories.keys())
            cat_counts = list(research_categories.values())

            fig_categories = px.pie(
                values=cat_counts, names=categories,
                title="Research Focus Distribution",
            )
            plots["categories"] = fig_categories

        return plots


def create_gradio_interface():
    """Create the Gradio interface.

    Instantiates one ``CancerResearchLiteratureMiner`` and wires a query box,
    a result-count slider and a search button to three output tabs: a Markdown
    summary, a table of papers and three Plotly charts.

    Returns:
        The ``gr.Blocks`` app; launching it is left to the caller.
    """
    miner = CancerResearchLiteratureMiner()

    def shorten_title(title, limit=100):
        """Return *title* cut to *limit* characters plus "..." when longer."""
        title = title or "N/A"
        return title[:limit] + "..." if len(title) > limit else title

    def search_and_analyze(query, max_results):
        """Main function to search and analyze literature.

        Returns a 5-tuple matching the click outputs: summary text, papers
        DataFrame, year plot, animal-model plot, category plot.  On any
        early exit only the message is set and the rest are ``None``.
        """
        if not query or not query.strip():
            return "Please enter a search query.", None, None, None, None

        # The slider may deliver a float; the search expects a whole number.
        papers = miner.search_pubmed(query, int(max_results))

        # An empty result is not a failure, so it gets its own message.
        if not papers:
            return "No papers found.", None, None, None, None
        if papers[0].get("error"):
            return f"Error: {papers[0]['error']}", None, None, None, None

        analysis = miner.analyze_papers(papers)
        summary = miner.generate_summary(papers, analysis)
        plots = miner.create_visualizations(analysis)

        papers_df = pd.DataFrame([
            {
                "PMID": p.get("pmid", "N/A"),
                "Title": shorten_title(p.get("title")),
                "Authors": p.get("authors", "N/A"),
                "Journal": p.get("journal", "N/A"),
                "Year": p.get("year", "N/A"),
            }
            for p in papers
        ])

        return (
            summary,
            papers_df,
            plots.get("year_dist"),
            plots.get("animal_models"),
            plots.get("categories"),
        )

    with gr.Blocks(title="Cancer Research Literature Mining Agent", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🔬 Cancer Research Literature Mining Agent

        This AI agent searches and analyzes scientific literature related to cancer research in animal models.
        It automatically extracts insights about animal models used, research focus areas, and emerging trends.

        **Features:**
        - PubMed literature search with animal model focus
        - Automatic categorization of research areas
        - Drug mention extraction
        - Publication trend analysis
        - Interactive visualizations
        """)

        with gr.Row():
            with gr.Column(scale=2):
                query_input = gr.Textbox(
                    label="Research Query",
                    placeholder="e.g., 'breast cancer immunotherapy', 'lung cancer biomarkers', 'pancreatic cancer treatment'",
                    lines=2,
                )
                max_results = gr.Slider(
                    minimum=10, maximum=100, value=50, step=10,
                    label="Maximum Results",
                )
                search_btn = gr.Button("🔍 Search & Analyze Literature", variant="primary")

            with gr.Column(scale=1):
                gr.Markdown("""
                ### Tips for Better Results:
                - Use specific cancer types (e.g., "breast cancer", "melanoma")
                - Include treatment modalities (e.g., "immunotherapy", "chemotherapy")
                - Add animal model terms (e.g., "mouse model", "xenograft")
                """)

        with gr.Tabs():
            with gr.TabItem("📊 Summary & Insights"):
                summary_output = gr.Markdown(label="Analysis Summary")

            with gr.TabItem("📚 Papers Found"):
                papers_output = gr.Dataframe(
                    headers=["PMID", "Title", "Authors", "Journal", "Year"],
                    label="Retrieved Papers",
                )

            with gr.TabItem("📈 Visualizations"):
                with gr.Row():
                    year_plot = gr.Plot(label="Publication Timeline")
                    models_plot = gr.Plot(label="Animal Models")
                with gr.Row():
                    categories_plot = gr.Plot(label="Research Categories")

        # Output order must match the tuple returned by search_and_analyze.
        search_btn.click(
            search_and_analyze,
            inputs=[query_input, max_results],
            outputs=[summary_output, papers_output, year_plot, models_plot, categories_plot],
        )

        gr.Examples(
            examples=[
                ["breast cancer immunotherapy mouse model", 50],
                ["lung cancer biomarkers xenograft", 30],
                ["pancreatic cancer treatment PDX", 40],
                ["melanoma drug resistance animal model", 35],
            ],
            inputs=[query_input, max_results],
        )

        gr.Markdown("""
        ### About This Agent
        This literature mining agent is specifically designed for cancer research in animal models.
        It searches PubMed for relevant papers and provides automated analysis of research trends,
        commonly used animal models, and emerging therapeutic approaches.

        **Data Sources:** PubMed/NCBI databases
        **Last Updated:** June 2025
        **Supported Research Areas:** All cancer types and animal models
        """)

    return interface


if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every network
    # interface at port 7860.
    # NOTE(review): share=True asks Gradio for a share link in addition to
    # the local server - confirm that exposure is intended.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    interface = create_gradio_interface()
    interface.launch(**launch_options)