Spaces:

CCockrum
/

Oncological-Literature-Mining-Agent

Sleeping

App Files Files Community

CCockrum committed on Jun 17

Commit

abad8a2

verified ·

1 Parent(s): 034aafa

Create app.py

Browse files

Files changed (1) hide show

app.py +405 -0

app.py ADDED Viewed

	@@ -0,0 +1,405 @@

+import gradio as gr
+import requests
+import json
+import pandas as pd
+from datetime import datetime, timedelta
+import re
+from typing import List, Dict, Tuple
+import xml.etree.ElementTree as ET
+from collections import Counter
+import plotly.express as px
+import plotly.graph_objects as go
+from transformers import pipeline
+import numpy as np
class CancerResearchLiteratureMiner:
    """Search PubMed for animal-model cancer papers and derive simple statistics.

    The workflow is: ``search_pubmed`` (ESearch + EFetch) -> ``analyze_papers``
    (keyword / regex / optional zero-shot classification) ->
    ``generate_summary`` (Markdown) and ``create_visualizations`` (Plotly).

    Attributes:
        summarizer: Summarization pipeline, or ``None`` if loading failed.
            No method of this class uses it; it is kept for callers that may.
        classifier: Zero-shot classification pipeline, or ``None`` if loading
            failed. ``analyze_papers`` skips categorisation when it is ``None``.
        research_categories: Candidate labels handed to the classifier.
        animal_keywords: Terms counted as animal-model mentions.
    """

    # Heuristic: a capitalised word ending in a common drug-name suffix.
    # It only sees capitalised words and will also match ordinary words such
    # as "Medicine", so the counts are indicative rather than exact.
    _DRUG_PATTERN = re.compile(r'\b[A-Z][a-z]*(?:mab|nib|ine|ole|cin|tin)\b')

    # Seconds to wait for each NCBI request before giving up, so a stalled
    # connection cannot hang the UI callback indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        # Initialize NLP pipelines. Each one is loaded independently so that a
        # failure to load one model does not disable the other.
        self.summarizer = self._load_pipeline("summarization", "facebook/bart-large-cnn")
        self.classifier = self._load_pipeline("zero-shot-classification", "facebook/bart-large-mnli")
        # Research categories for classification
        self.research_categories = [
            "drug discovery", "immunotherapy", "chemotherapy", "radiation therapy",
            "biomarkers", "diagnostics", "metastasis", "tumor microenvironment",
            "animal models", "preclinical studies", "toxicity", "pharmacokinetics"
        ]
        # Animal model keywords
        self.animal_keywords = [
            "mouse", "mice", "rat", "rats", "xenograft", "orthotopic", "transgenic",
            "knockout", "immunodeficient", "nude mice", "SCID", "NOD", "PDX",
            "patient-derived xenograft", "syngeneic", "canine", "dog", "feline", "cat"
        ]

    @staticmethod
    def _load_pipeline(task: str, model: str):
        """Return a transformers pipeline for *task*, or ``None`` if it cannot be built."""
        try:
            return pipeline(task, model=model)
        except Exception as e:
            # Best effort: the app still works (without this feature) when the
            # model cannot be downloaded or loaded.
            print(f"Warning: Could not load transformers models: {e}")
            return None

    @staticmethod
    def _element_text(elem) -> str:
        """Return all text inside *elem*, including nested markup, whitespace-normalised.

        ``Element.text`` only holds the text before the first child element, so
        a title such as ``<ArticleTitle><i>TP53</i> loss</ArticleTitle>`` would
        yield ``None``; ``itertext`` collects the full content instead.
        """
        if elem is None:
            return ""
        return " ".join("".join(elem.itertext()).split())

    def search_pubmed(self, query: str, max_results: int = 50) -> List[Dict]:
        """Search PubMed for cancer research papers.

        Args:
            query: Free-text query; it is AND-ed with a fixed set of
                animal-model terms before being sent to ESearch.
            max_results: Maximum number of records to request (``retmax``).

        Returns:
            A list of paper dicts as produced by ``_parse_pubmed_xml``, an
            empty list when the search matched nothing, or a single-element
            list ``[{"error": ...}]`` when any step failed.
        """
        # Enhance query with animal model terms
        enhanced_query = f"({query}) AND (animal model OR mouse OR mice OR rat OR xenograft OR preclinical)"
        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        try:
            search_params = {
                "db": "pubmed",
                "term": enhanced_query,
                # UI sliders may deliver a float; the API expects an integer.
                "retmax": int(max_results),
                "retmode": "json",
                "sort": "relevance"
            }
            search_response = requests.get(
                search_url, params=search_params, timeout=self._REQUEST_TIMEOUT
            )
            # Surface HTTP errors (e.g. rate limiting) as such instead of as a
            # confusing JSON decoding failure.
            search_response.raise_for_status()
            search_data = search_response.json()
            ids = search_data.get("esearchresult", {}).get("idlist", [])
            if not ids:
                return []
            # Get detailed information
            fetch_params = {
                "db": "pubmed",
                "id": ",".join(ids),
                "retmode": "xml"
            }
            fetch_response = requests.get(
                fetch_url, params=fetch_params, timeout=self._REQUEST_TIMEOUT
            )
            fetch_response.raise_for_status()
            # Parse XML response
            return self._parse_pubmed_xml(fetch_response.text)
        except Exception as e:
            # Callers detect failure through the "error" key of the first item.
            return [{"error": f"Search failed: {str(e)}"}]

    def _parse_pubmed_xml(self, xml_content: str) -> List[Dict]:
        """Parse PubMed XML response.

        Args:
            xml_content: XML text containing ``PubmedArticle`` elements.

        Returns:
            One dict per article with the string keys ``pmid``, ``title``,
            ``abstract``, ``authors``, ``journal`` and ``year`` ("N/A" when a
            field is missing; ``authors`` is "" when none are listed), or
            ``[{"error": ...}]`` when parsing fails.
        """
        papers = []
        try:
            root = ET.fromstring(xml_content)
            for article in root.findall(".//PubmedArticle"):
                paper = {}
                # Extract basic info
                medline = article.find(".//MedlineCitation")
                pmid = medline.find(".//PMID") if medline is not None else None
                paper["pmid"] = self._element_text(pmid) or "N/A"
                # Extract title (may contain inline markup)
                paper["title"] = self._element_text(article.find(".//ArticleTitle")) or "N/A"
                # Extract abstract: structured abstracts have several
                # AbstractText sections, so join all of them.
                sections = [
                    self._element_text(section)
                    for section in article.findall(".//Abstract/AbstractText")
                ]
                paper["abstract"] = " ".join(s for s in sections if s) or "N/A"
                # Extract authors (first three, with an ellipsis if truncated)
                authors = []
                for author in article.findall(".//Author"):
                    fname = self._element_text(author.find(".//ForeName"))
                    lname = self._element_text(author.find(".//LastName"))
                    if fname and lname:
                        authors.append(f"{fname} {lname}")
                paper["authors"] = ", ".join(authors[:3]) + ("..." if len(authors) > 3 else "")
                # Extract journal and date
                paper["journal"] = self._element_text(article.find(".//Journal/Title")) or "N/A"
                year = self._element_text(article.find(".//PubDate/Year"))
                if not year:
                    # Some records carry a free-form MedlineDate ("2021 Jan-Feb")
                    # instead of a Year element; take its first 4-digit number.
                    medline_date = self._element_text(article.find(".//PubDate/MedlineDate"))
                    found = re.search(r"\b(\d{4})\b", medline_date)
                    year = found.group(1) if found else ""
                paper["year"] = year or "N/A"
                papers.append(paper)
        except Exception as e:
            return [{"error": f"XML parsing failed: {str(e)}"}]
        return papers

    def analyze_papers(self, papers: List[Dict]) -> Dict:
        """Analyze the retrieved papers for insights.

        Args:
            papers: Paper dicts as returned by ``search_pubmed``.

        Returns:
            ``{"error": ...}`` when there is nothing to analyze, otherwise a
            dict with ``total_papers``, ``year_distribution``,
            ``animal_models``, ``research_categories``, ``drug_mentions`` (the
            ten most frequent matches) and the currently unpopulated
            ``key_findings`` / ``methodology_trends``.
        """
        if not papers or papers[0].get("error"):
            return {"error": "No papers to analyze"}

        # Match keywords as whole words (optionally plural) and ignore case.
        # Plain substring tests counted "rat" in "demonstrate" and "cat" in
        # "indicates", and could never match the upper-case keywords
        # ("SCID", "NOD", "PDX") against lower-cased text.
        keyword_patterns = [
            (keyword, re.compile(rf"\b{re.escape(keyword)}s?\b", re.IGNORECASE))
            for keyword in self.animal_keywords
        ]

        year_counts = Counter()
        model_counts = Counter()
        category_counts = Counter()
        drug_counts = Counter()
        failed_classifications = 0
        last_classifier_error = None

        for paper in papers:
            # Year distribution
            year_counts[paper.get("year") or "Unknown"] += 1

            # ``or ""`` guards against fields that are present but None.
            abstract = paper.get("abstract") or ""
            title = paper.get("title") or ""
            full_text = f"{title} {abstract}"

            # Animal model detection: one count per paper per keyword
            for keyword, pattern in keyword_patterns:
                if pattern.search(full_text):
                    model_counts[keyword] += 1

            # Extract drug mentions (simple regex for common drug patterns)
            drug_counts.update(self._DRUG_PATTERN.findall(abstract))

            # Classify research category if classifier is available
            lowered = abstract.lower()
            if self.classifier and lowered and lowered != "n/a":
                try:
                    result = self.classifier(lowered[:512], self.research_categories)
                    category_counts[result["labels"][0]] += 1
                except Exception as e:
                    # Classification is best effort; remember the failure so
                    # it can be reported once instead of being lost.
                    failed_classifications += 1
                    last_classifier_error = e

        if failed_classifications:
            print(
                f"Warning: classification failed for {failed_classifications} "
                f"paper(s): {last_classifier_error}"
            )

        return {
            "total_papers": len(papers),
            "year_distribution": dict(year_counts),
            "animal_models": dict(model_counts),
            "research_categories": dict(category_counts),
            "key_findings": [],
            "drug_mentions": dict(drug_counts.most_common(10)),
            "methodology_trends": {}
        }

    def generate_summary(self, papers: List[Dict], analysis: Dict) -> str:
        """Generate a comprehensive summary of findings.

        Args:
            papers: Paper dicts as returned by ``search_pubmed``.
            analysis: Result of ``analyze_papers`` for the same papers.

        Returns:
            A Markdown report, or a short message when there are no papers or
            either input carries an error.
        """
        if not papers or papers[0].get("error") or analysis.get("error"):
            return "No papers found or error in retrieval."
        summary = f"""
# Literature Mining Summary
## Overview
- **Total Papers Found**: {analysis['total_papers']}
- **Search Date**: {datetime.now().strftime('%Y-%m-%d')}
## Key Insights
### Animal Models Used
"""
        # Top animal models
        if analysis["animal_models"]:
            top_models = sorted(analysis["animal_models"].items(), key=lambda x: x[1], reverse=True)[:5]
            for model, count in top_models:
                summary += f"- **{model.title()}**: {count} papers\n"
        summary += "\n### Research Focus Areas\n"
        # Research categories
        if analysis["research_categories"]:
            top_categories = sorted(analysis["research_categories"].items(), key=lambda x: x[1], reverse=True)[:5]
            for category, count in top_categories:
                summary += f"- **{category.title()}**: {count} papers\n"
        summary += "\n### Frequently Mentioned Drugs\n"
        # Drug mentions (already ordered by frequency by analyze_papers)
        if analysis["drug_mentions"]:
            for drug, count in list(analysis["drug_mentions"].items())[:5]:
                summary += f"- **{drug}**: {count} mentions\n"
        summary += "\n### Recent Highlights\n"
        # Recent papers (last 2 years)
        current_year = datetime.now().year
        recent_papers = [
            p for p in papers
            if str(p.get("year") or "").isdigit() and int(p["year"]) >= current_year - 2
        ]
        for paper in recent_papers[:3]:
            summary += f"- **{paper.get('title', 'N/A')}** ({paper.get('year', 'N/A')})\n"
            summary += f"  *{paper.get('journal', 'N/A')}*\n\n"
        return summary

    def create_visualizations(self, analysis: Dict):
        """Create visualization plots.

        Args:
            analysis: Result of ``analyze_papers``.

        Returns:
            A dict that may contain the keys ``year_dist``, ``animal_models``
            and ``categories``, each mapped to a Plotly figure. A key is
            omitted when the corresponding data is empty or missing.
        """
        plots = {}
        # Year distribution, in chronological order with non-numeric labels
        # ("N/A", "Unknown") placed last.
        year_distribution = analysis.get("year_distribution") or {}
        if year_distribution:
            ordered_years = sorted(
                year_distribution.items(),
                key=lambda item: (not str(item[0]).isdigit(), str(item[0]))
            )
            fig_year = px.bar(
                x=[year for year, _ in ordered_years],
                y=[count for _, count in ordered_years],
                title="Publication Year Distribution",
                labels={"x": "Year", "y": "Number of Papers"}
            )
            plots["year_dist"] = fig_year
        # Animal models: the ten with the highest counts, so the chart matches
        # its "Most Common" title.
        animal_models = analysis.get("animal_models") or {}
        if animal_models:
            top_models = Counter(animal_models).most_common(10)
            # Horizontal bars are drawn bottom-up, so reverse to put the
            # largest bar at the top.
            top_models.reverse()
            fig_models = px.bar(
                x=[count for _, count in top_models],
                y=[model for model, _ in top_models],
                orientation='h',
                title="Most Common Animal Models",
                labels={"x": "Number of Papers", "y": "Animal Model"}
            )
            plots["animal_models"] = fig_models
        # Research categories
        research_categories = analysis.get("research_categories") or {}
        if research_categories:
            fig_categories = px.pie(
                values=list(research_categories.values()),
                names=list(research_categories.keys()),
                title="Research Focus Distribution"
            )
            plots["categories"] = fig_categories
        return plots
def create_gradio_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app with a query box, a result-count slider, and
    tabs for the Markdown summary, the table of papers and three plots. The
    search button is wired to ``search_and_analyze``.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    miner = CancerResearchLiteratureMiner()

    def shorten_title(title, limit=100):
        """Return *title* cut to *limit* characters plus "..." when longer."""
        # A parsed title can be missing or None; show the placeholder instead
        # of failing on len(None).
        title = title or "N/A"
        return title[:limit] + "..." if len(title) > limit else title

    def search_and_analyze(query, max_results):
        """Main function to search and analyze literature.

        Returns a 5-tuple matching the click handler's outputs: summary
        Markdown, papers DataFrame, and the year / animal-model / category
        figures. On failure the first item is a message and the rest are None.
        """
        if not query or not query.strip():
            return "Please enter a search query.", None, None, None, None
        # Search papers (the slider may deliver a float; the API wants an int)
        papers = miner.search_pubmed(query, int(max_results))
        if not papers or papers[0].get("error"):
            error_msg = papers[0].get("error", "No papers found") if papers else "No papers found"
            return f"Error: {error_msg}", None, None, None, None
        # Analyze papers
        analysis = miner.analyze_papers(papers)
        if analysis.get("error"):
            return f"Error: {analysis['error']}", None, None, None, None
        # Generate summary
        summary = miner.generate_summary(papers, analysis)
        # Create visualizations
        plots = miner.create_visualizations(analysis)
        # Create papers dataframe
        papers_df = pd.DataFrame([
            {
                "PMID": p.get("pmid", "N/A"),
                "Title": shorten_title(p.get("title")),
                "Authors": p.get("authors", "N/A"),
                "Journal": p.get("journal", "N/A"),
                "Year": p.get("year", "N/A")
            }
            for p in papers
        ])
        return (
            summary,
            papers_df,
            plots.get("year_dist"),
            plots.get("animal_models"),
            plots.get("categories")
        )

    # Create interface
    with gr.Blocks(title="Cancer Research Literature Mining Agent", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🔬 Cancer Research Literature Mining Agent
        This AI agent searches and analyzes scientific literature related to cancer research in animal models.
        It automatically extracts insights about animal models used, research focus areas, and emerging trends.
        **Features:**
        - PubMed literature search with animal model focus
        - Automatic categorization of research areas
        - Drug mention extraction
        - Publication trend analysis
        - Interactive visualizations
        """)
        with gr.Row():
            with gr.Column(scale=2):
                query_input = gr.Textbox(
                    label="Research Query",
                    placeholder="e.g., 'breast cancer immunotherapy', 'lung cancer biomarkers', 'pancreatic cancer treatment'",
                    lines=2
                )
                max_results = gr.Slider(
                    minimum=10, maximum=100, value=50, step=10,
                    label="Maximum Results"
                )
                search_btn = gr.Button("🔍 Search & Analyze Literature", variant="primary")
            with gr.Column(scale=1):
                gr.Markdown("""
                ### Tips for Better Results:
                - Use specific cancer types (e.g., "breast cancer", "melanoma")
                - Include treatment modalities (e.g., "immunotherapy", "chemotherapy")
                - Add animal model terms (e.g., "mouse model", "xenograft")
                """)
        with gr.Tabs():
            with gr.TabItem("📊 Summary & Insights"):
                summary_output = gr.Markdown(label="Analysis Summary")
            with gr.TabItem("📋 Papers Found"):
                papers_output = gr.Dataframe(
                    headers=["PMID", "Title", "Authors", "Journal", "Year"],
                    label="Retrieved Papers"
                )
            with gr.TabItem("📈 Visualizations"):
                with gr.Row():
                    year_plot = gr.Plot(label="Publication Timeline")
                    models_plot = gr.Plot(label="Animal Models")
                with gr.Row():
                    categories_plot = gr.Plot(label="Research Categories")
        # Connect the search function; the outputs list order must match the
        # tuple returned by search_and_analyze.
        search_btn.click(
            search_and_analyze,
            inputs=[query_input, max_results],
            outputs=[summary_output, papers_output, year_plot, models_plot, categories_plot]
        )
        # Add examples
        gr.Examples(
            examples=[
                ["breast cancer immunotherapy mouse model", 50],
                ["lung cancer biomarkers xenograft", 30],
                ["pancreatic cancer treatment PDX", 40],
                ["melanoma drug resistance animal model", 35]
            ],
            inputs=[query_input, max_results]
        )
        gr.Markdown("""
        ### About This Agent
        This literature mining agent is specifically designed for cancer research in animal models.
        It searches PubMed for relevant papers and provides automated analysis of research trends,
        commonly used animal models, and emerging therapeutic approaches.
        **Data Sources:** PubMed/NCBI databases
        **Last Updated:** June 2025
        **Supported Research Areas:** All cancer types and animal models
        """)
    return interface
# Script entry point: build the UI, then start the Gradio server.
if __name__ == "__main__":
    # Listen on all interfaces at port 7860 and request a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    interface = create_gradio_interface()
    interface.launch(**launch_options)