Spaces:
Sleeping
Sleeping
Update api_clients/pubmed_client.py
Browse files- api_clients/pubmed_client.py +105 -0
api_clients/pubmed_client.py
CHANGED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# api_clients/pubmed_client.py
|
| 2 |
+
"""
|
| 3 |
+
Client for the PubMed API via NCBI's Entrez E-utilities.
|
| 4 |
+
This module performs a two-step search: first finding
|
| 5 |
+
relevant article IDs (PMIDs) and then fetching their structured summaries.
|
| 6 |
+
It prioritizes review articles to provide high-quality,
|
| 7 |
+
synthesized information to the main orchestrator.
|
| 8 |
+
"""
|
| 9 |
+
import aiohttp
|
| 10 |
+
from .config import PUBMED_BASE_URL, REQUEST_HEADERS
|
| 11 |
+
|
| 12 |
+
async def search_pubmed(session: aiohttp.ClientSession, query: str, max_results: int = 5) -> list[dict]:
    """
    Search PubMed and return a list of article summaries.

    Strategy:
    1. ESearch for PMIDs matching the query, restricted to review articles
       (ideal for summarization) and sorted by relevance.
    2. If no review articles are found, fall back to an unrestricted search.
    3. ESummary to fetch concise metadata for the found PMIDs.

    Args:
        session (aiohttp.ClientSession): The active HTTP session.
        query (str): The search term, likely a combination of concepts
            (e.g., "Migraine AND Aura").
        max_results (int): The maximum number of article summaries to return.

    Returns:
        list[dict]: One dict per article with keys 'uid', 'title', 'pubdate',
        'authors', 'journal', and 'url'. Returns an empty list if the query
        is empty, no results are found, or an error occurs.
    """
    # Guard against empty queries and non-positive result counts.
    if not query or max_results <= 0:
        return []

    # --- Step 1: ESearch - Find relevant article PMIDs ---
    # - `AND review[Publication Type]`: narrows results to high-value review articles.
    # - `sort=relevance`: ensures the best matches appear first.
    search_term = f"({query}) AND review[Publication Type]"

    # BUGFIX: query-string values must be strings for yarl/aiohttp, so
    # `retmax` is stringified instead of being passed as an int.
    esearch_params = {
        'db': 'pubmed',
        'term': search_term,
        'retmode': 'json',
        'retmax': str(max_results),
        'sort': 'relevance'
    }
    esearch_url = f"{PUBMED_BASE_URL}/esearch.fcgi"

    # BUGFIX: passing a bare number as `timeout=` is deprecated in aiohttp;
    # use explicit ClientTimeout objects (same 10s/15s budgets as before).
    search_timeout = aiohttp.ClientTimeout(total=10)
    summary_timeout = aiohttp.ClientTimeout(total=15)

    pmids: list[str] = []
    try:
        async with session.get(esearch_url, params=esearch_params, headers=REQUEST_HEADERS, timeout=search_timeout) as resp:
            resp.raise_for_status()
            data = await resp.json()
            pmids = data.get('esearchresult', {}).get('idlist', [])

        if not pmids:
            # If no review articles are found, try a broader search as a fallback.
            print(f"No review articles found for '{query}'. Broadening search...")
            esearch_params['term'] = query  # Remove the review filter
            # BUGFIX: the fallback request previously had no timeout and
            # could hang indefinitely; apply the same limit as the primary call.
            async with session.get(esearch_url, params=esearch_params, headers=REQUEST_HEADERS, timeout=search_timeout) as fallback_resp:
                fallback_resp.raise_for_status()
                fallback_data = await fallback_resp.json()
                pmids = fallback_data.get('esearchresult', {}).get('idlist', [])

        if not pmids:
            print(f"No PubMed results found for query: {query}")
            return []

        # --- Step 2: ESummary - Fetch summaries for the found PMIDs ---
        esummary_params = {
            'db': 'pubmed',
            'id': ",".join(pmids),  # E-utilities accept a comma-separated list of IDs
            'retmode': 'json'
        }
        esummary_url = f"{PUBMED_BASE_URL}/esummary.fcgi"

        async with session.get(esummary_url, params=esummary_params, headers=REQUEST_HEADERS, timeout=summary_timeout) as resp:
            resp.raise_for_status()
            summary_data = await resp.json()

        # The payload nests article records under 'result', keyed by PMID.
        # Iterate the original PMID list to preserve relevance ordering and
        # silently skip any IDs missing from the response.
        results = summary_data.get('result', {})

        parsed_articles = []
        for pmid in pmids:
            if pmid in results:
                article = results[pmid]
                parsed_articles.append({
                    'uid': article.get('uid', pmid),
                    'title': article.get('title', 'Title Not Available'),
                    'pubdate': article.get('pubdate', 'N/A'),
                    # BUGFIX: tolerate author entries without a 'name' key
                    # instead of raising KeyError mid-parse.
                    'authors': [author['name'] for author in article.get('authors', []) if 'name' in author],
                    'journal': article.get('source', 'N/A'),
                    'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
                })
        return parsed_articles

    except aiohttp.ClientError as e:
        print(f"An error occurred while fetching from PubMed: {e}")
        return []
    except Exception as e:
        # Broad boundary catch: this client must never crash the orchestrator.
        print(f"A general error occurred in the pubmed_client: {e}")
        return []
|