emdann's picture
Upload folder using huggingface_hub
032c0ea verified
from typing import List
import requests
import pandas as pd
import time
from datasets import load_dataset
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# if getting bad gateway errors, try:
# Retry session setup
session = requests.Session()
retries = Retry(
total=5, # Retry up to 5 times
backoff_factor=1, # Wait 1s, then 2s, then 4s...
status_forcelist=[502, 503, 504],
allowed_methods=["GET"],
)
session.mount("https://", HTTPAdapter(max_retries=retries))
def get_faers_adverse_events(drug_name: str, limit: int = 100) -> List:
"""
Fetch and parse adverse event reports for a drug from openFDA FAERS.
Adds serious outcome flags and event context fields.
Args:
drug_name (str): The name of the drug to search for.
limit (int): The maximum number of records to fetch.
Returns:
List: A list of dictionaries containing adverse event data.
"""
print(f"Fetching FAERS data for: {drug_name}")
url = "https://api.fda.gov/drug/event.json"
params = {"search": f'patient.drug.medicinalproduct:"{drug_name}"', "limit": limit}
try:
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()
results = response.json().get("results", [])
# Extract detailed adverse event fields
events = []
for entry in results:
patient = entry.get("patient", {})
reactions = [r.get("reactionmeddrapt") for r in patient.get("reaction", [])]
event = {
"drug_name": drug_name,
"safetyreportid": entry.get("safetyreportid"),
"serious": entry.get("serious"),
"seriousnessdeath": entry.get("seriousnessdeath"),
"seriousnesshospitalization": entry.get("seriousnesshospitalization"),
"seriousnessdisabling": entry.get("seriousnessdisabling"),
"seriousnesslifethreatening": entry.get("seriousnesslifethreatening"),
"seriousnesscongenitalanomali": entry.get("seriousnesscongenitalanomali"),
"reactions": reactions,
"receivedate": entry.get("receivedate"),
"occurcountry": entry.get("occurcountry"),
}
events.append(event)
print(f"Fetched {len(events)} events for {drug_name}")
return events
except requests.exceptions.RequestException as e:
print(f"Failed to fetch data for {drug_name}: {e}")
return []
def pull_faers_data(drug_list: List[str], output_filename_prefix: str) -> pd.DataFrame:
"""
Pull FAERS data for a list of drugs and save to CSV.
Args:
drug_list (List[str]): List of drug names to fetch data for.
"""
all_events = []
for drug in drug_list:
events = get_faers_adverse_events(drug, limit=100)
all_events.extend(events)
time.sleep(1) # be polite to openFDA rate limits
# Convert to DataFrame and show or save
df = pd.DataFrame(all_events)
df.to_csv(f"{output_filename_prefix}_faers_adverse_events.csv", index=False)
print(df.head(), df.shape)
return df
# example drug list
drug_list = ["pembrolizumab", "nivolumab", "ipilimumab"]
# test out fx
pull_faers_data(drug_list=drug_list, output_filename_prefix="test")
# load tahoe drug data metadata, gather the AEs for each unique drug
ds = load_dataset("tahoebio/Tahoe-100M", "drug_metadata")
drug_metadata = ds["train"].to_pandas()
# get adverse events for each unique drug in the tahoe dataset
tahoe_drug_list = drug_metadata["drug"].unique().tolist()
adverse_events = pull_faers_data(drug_list=tahoe_drug_list, output_filename_prefix="tahoe_drugs")
# adverse_events = pd.read_csv("tahoe_drugs_faers_adverse_events.csv")
# summarize the adverse events for each drug so we can try to predict the severity of a drug
def compute_severity_score(df: pd.DataFrame) -> float:
weights = {
"seriousnessdeath": 5,
"seriousnesslifethreatening": 4,
"seriousnesshospitalization": 3,
"seriousnessdisabling": 2,
"seriousnesscongenitalanomali": 1,
}
score = 0
for k, w in weights.items():
score += df[k].fillna("0").astype(int).sum() * w
return score
def compute_seriousness_ratio(df: pd.DataFrame) -> float:
"""
Compute the ratio of serious to non-serious adverse events.
"""
seriousness_ae_ratio = (
df.groupby("drug_name")["serious"]
.apply(lambda x: x.fillna("0").astype(int).mean())
.reset_index(name="mean_serious_flag")
)
return seriousness_ae_ratio
ae_score_df = adverse_events.groupby("drug_name").apply(compute_severity_score).reset_index(name="ae_severity_score")
seriousness_ratio = compute_seriousness_ratio(adverse_events)
# write out outputs
ae_score_df.to_csv("tahoe_drug_ae_severity_score.csv", index=False)
seriousness_ratio.to_csv("tahoe_drug_ae_seriousness_ratio.csv", index=False)