|
from typing import List |
|
import requests |
|
import pandas as pd |
|
import time |
|
|
|
from datasets import load_dataset |
|
from requests.adapters import HTTPAdapter |
|
from urllib3.util.retry import Retry |
|
|
|
|
|
|
|
session = requests.Session() |
|
retries = Retry( |
|
total=5, |
|
backoff_factor=1, |
|
status_forcelist=[502, 503, 504], |
|
allowed_methods=["GET"], |
|
) |
|
session.mount("https://", HTTPAdapter(max_retries=retries)) |
|
|
|
|
|
def get_faers_adverse_events(drug_name: str, limit: int = 100) -> List: |
|
""" |
|
Fetch and parse adverse event reports for a drug from openFDA FAERS. |
|
Adds serious outcome flags and event context fields. |
|
Args: |
|
drug_name (str): The name of the drug to search for. |
|
limit (int): The maximum number of records to fetch. |
|
Returns: |
|
List: A list of dictionaries containing adverse event data. |
|
""" |
|
print(f"Fetching FAERS data for: {drug_name}") |
|
url = "https://api.fda.gov/drug/event.json" |
|
params = {"search": f'patient.drug.medicinalproduct:"{drug_name}"', "limit": limit} |
|
|
|
try: |
|
response = requests.get(url, params=params, timeout=10) |
|
response.raise_for_status() |
|
results = response.json().get("results", []) |
|
|
|
|
|
events = [] |
|
for entry in results: |
|
patient = entry.get("patient", {}) |
|
reactions = [r.get("reactionmeddrapt") for r in patient.get("reaction", [])] |
|
|
|
event = { |
|
"drug_name": drug_name, |
|
"safetyreportid": entry.get("safetyreportid"), |
|
"serious": entry.get("serious"), |
|
"seriousnessdeath": entry.get("seriousnessdeath"), |
|
"seriousnesshospitalization": entry.get("seriousnesshospitalization"), |
|
"seriousnessdisabling": entry.get("seriousnessdisabling"), |
|
"seriousnesslifethreatening": entry.get("seriousnesslifethreatening"), |
|
"seriousnesscongenitalanomali": entry.get("seriousnesscongenitalanomali"), |
|
"reactions": reactions, |
|
"receivedate": entry.get("receivedate"), |
|
"occurcountry": entry.get("occurcountry"), |
|
} |
|
events.append(event) |
|
print(f"Fetched {len(events)} events for {drug_name}") |
|
|
|
return events |
|
|
|
except requests.exceptions.RequestException as e: |
|
print(f"Failed to fetch data for {drug_name}: {e}") |
|
return [] |
|
|
|
|
|
def pull_faers_data(drug_list: List[str], output_filename_prefix: str) -> pd.DataFrame: |
|
""" |
|
Pull FAERS data for a list of drugs and save to CSV. |
|
Args: |
|
drug_list (List[str]): List of drug names to fetch data for. |
|
""" |
|
all_events = [] |
|
for drug in drug_list: |
|
events = get_faers_adverse_events(drug, limit=100) |
|
all_events.extend(events) |
|
time.sleep(1) |
|
|
|
|
|
df = pd.DataFrame(all_events) |
|
df.to_csv(f"{output_filename_prefix}_faers_adverse_events.csv", index=False) |
|
print(df.head(), df.shape) |
|
return df |
|
|
|
|
|
|
|
drug_list = ["pembrolizumab", "nivolumab", "ipilimumab"] |
|
|
|
|
|
pull_faers_data(drug_list=drug_list, output_filename_prefix="test") |
|
|
|
|
|
|
|
ds = load_dataset("tahoebio/Tahoe-100M", "drug_metadata") |
|
drug_metadata = ds["train"].to_pandas() |
|
|
|
|
|
tahoe_drug_list = drug_metadata["drug"].unique().tolist() |
|
adverse_events = pull_faers_data(drug_list=tahoe_drug_list, output_filename_prefix="tahoe_drugs") |
|
|
|
|
|
|
|
def compute_severity_score(df: pd.DataFrame) -> float: |
|
weights = { |
|
"seriousnessdeath": 5, |
|
"seriousnesslifethreatening": 4, |
|
"seriousnesshospitalization": 3, |
|
"seriousnessdisabling": 2, |
|
"seriousnesscongenitalanomali": 1, |
|
} |
|
score = 0 |
|
for k, w in weights.items(): |
|
score += df[k].fillna("0").astype(int).sum() * w |
|
return score |
|
|
|
|
|
def compute_seriousness_ratio(df: pd.DataFrame) -> float: |
|
""" |
|
Compute the ratio of serious to non-serious adverse events. |
|
""" |
|
seriousness_ae_ratio = ( |
|
df.groupby("drug_name")["serious"] |
|
.apply(lambda x: x.fillna("0").astype(int).mean()) |
|
.reset_index(name="mean_serious_flag") |
|
) |
|
return seriousness_ae_ratio |
|
|
|
|
|
ae_score_df = adverse_events.groupby("drug_name").apply(compute_severity_score).reset_index(name="ae_severity_score") |
|
seriousness_ratio = compute_seriousness_ratio(adverse_events) |
|
|
|
|
|
ae_score_df.to_csv("tahoe_drug_ae_severity_score.csv", index=False) |
|
seriousness_ratio.to_csv("tahoe_drug_ae_seriousness_ratio.csv", index=False) |
|
|