{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get clinical trial info for drugs in Tahoe dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Autoreload extension\n", "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Optional setup: `huggingface_hub` provides the `huggingface-cli` command used to log in before loading the dataset\n", "# %pip install huggingface_hub\n", "# %pip install datasets" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import pandas as pd\n", "import json\n", "import time\n", "from typing import List, Dict, Any, Union, Optional\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import numpy as np\n", "\n", "from drug_trials_extractor import DrugTrialExtractor\n", "from drug_trials_extractor import standardize_medical_conditions" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/emmadann/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "# Login using e.g. `huggingface-cli login` to access this dataset\n", "\n", "ds = load_dataset(\"tahoebio/Tahoe-100M\", \"drug_metadata\")\n", "cs = load_dataset(\"tahoebio/Tahoe-100M\", \"cell_line_metadata\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | drug | \n", "targets | \n", "moa-broad | \n", "moa-fine | \n", "human-approved | \n", "clinical-trials | \n", "gpt-notes-approval | \n", "canonical_smiles | \n", "pubchem_cid | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "Talc | \n", "None | \n", "unclear | \n", "unclear | \n", "yes | \n", "yes | \n", "Talc used in pharma and cosmetics; safety unde... | \n", "[OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O... | \n", "165411828.0 | \n", "
1 | \n", "Bortezomib | \n", "PSMB5 | \n", "inhibitor/antagonist | \n", "Proteasome inhibitor | \n", "yes | \n", "yes | \n", "Approved for multiple myeloma and mantle cell ... | \n", "B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN... | \n", "387447.0 | \n", "
2 | \n", "Ixazomib | \n", "PSMB5 | \n", "inhibitor/antagonist | \n", "Proteasome inhibitor | \n", "yes | \n", "yes | \n", "Approved for multiple myeloma treatment. | \n", "B(C(CC(C)C)NC(=O)CNC(=O)C1=C(C=CC(=C1)Cl)Cl)(O)O | \n", "25183872.0 | \n", "
3 | \n", "Ixazomib citrate | \n", "PSMB1, PSMB2, PSMB5 | \n", "inhibitor/antagonist | \n", "Proteasome inhibitor | \n", "yes | \n", "yes | \n", "Approved for multiple myeloma treatment as par... | \n", "B1(OC(=O)C(O1)(CC(=O)O)CC(=O)O)C(CC(C)C)NC(=O)... | \n", "56844015.0 | \n", "
4 | \n", "Lactate (calcium) | \n", "None | \n", "unclear | \n", "unclear | \n", "yes | \n", "yes | \n", "Used in medical settings, but not specifically... | \n", "C.CC(C(=O)[O-])O.[Ca+2] | \n", "168311648.0 | \n", "
\n", " | nct_id | \n", "title | \n", "status | \n", "phase | \n", "conditions | \n", "interventions | \n", "pubchem_id | \n", "drug_name | \n", "standardized_condition | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "NCT04010565 | \n", "Effect of a Black Garlic Extract on Cholestero... | \n", "COMPLETED | \n", "NaN | \n", "Cardiovascular Diseases | \n", "Aged black garlic extract (DIETARY_SUPPLEMENT)... | \n", "165411828 | \n", "STERITALC | \n", "Cardiovascular Diseases | \n", "
1 | \n", "NCT00565266 | \n", "Asthma Clinical Research Network (ACRN) Trial ... | \n", "COMPLETED | \n", "PHASE3 | \n", "Asthma | \n", "tiotropium bromide (DRUG); salmeterol xinafoat... | \n", "165411828 | \n", "STERITALC | \n", "Asthma | \n", "
2 | \n", "NCT04401579 | \n", "Adaptive COVID-19 Treatment Trial 2 (ACTT-2) | \n", "COMPLETED | \n", "PHASE3 | \n", "COVID-19 | \n", "Placebo (OTHER); Remdesivir (DRUG); Baricitini... | \n", "165411828 | \n", "STERITALC | \n", "COVID-19 | \n", "
3 | \n", "NCT05576038 | \n", "Tryptophan for Impaired AhR Signaling in Celia... | \n", "RECRUITING | \n", "NaN | \n", "Tryptophan Metabolism Alterations; Celiac Disease | \n", "L-Tryptophan (DRUG); Freedom SimpleCap Powder ... | \n", "165411828 | \n", "STERITALC | \n", "Tryptophan Metabolism Alterations; Celiac Disease | \n", "
4 | \n", "NCT06744946 | \n", "Thoracoscopic Talc Pleurodesis Versus Bleomyci... | \n", "COMPLETED | \n", "PHASE4 | \n", "Malignant Pleural Effusions (Mpe)- Pleurodesis... | \n", "thoracoscope (PROCEDURE) | \n", "165411828 | \n", "STERITALC | \n", "Malignant Pleural Effusions (Mpe)- Pleurodesis... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
58800 | \n", "NCT03387709 | \n", "UCSD Get Fit, Be Fit Study | \n", "COMPLETED | \n", "NaN | \n", "Overweight and Obesity | \n", "Pistachio-enriched diet (BEHAVIORAL); General ... | \n", "5282164 | \n", "OZ | \n", "Overweight and Obesity | \n", "
58801 | \n", "NCT01084109 | \n", "First Bites: Complementary Feeding - A Global ... | \n", "COMPLETED | \n", "PHASE3 | \n", "Complementary Feeding; Infant Growth | \n", "Lyophilized meat (OTHER); Cereal (OTHER) | \n", "5282164 | \n", "OZ | \n", "Complementary Feeding; Infant Growth | \n", "
58802 | \n", "NCT04987307 | \n", "Safety and Efficacy of Efavaleukin Alfa in Par... | \n", "TERMINATED | \n", "PHASE2 | \n", "Ulcerative Colitis | \n", "Efavaleukin alfa (DRUG); Placebo (DRUG) | \n", "5282164 | \n", "OZ | \n", "Ulcerative Colitis | \n", "
58803 | \n", "NCT03990909 | \n", "Pilot Study of BCAA on Sleep | \n", "RECRUITING | \n", "NaN | \n", "Sleep Disorder; Traumatic Brain Injury | \n", "Branched Chain Amino Acids (DIETARY_SUPPLEMENT... | \n", "5282164 | \n", "OZ | \n", "Sleep Disorder; Traumatic Brain Injury | \n", "
58804 | \n", "NCT04000685 | \n", "The Effects of Different Exercise Approaches i... | \n", "COMPLETED | \n", "NaN | \n", "Chronic Low-back Pain | \n", "Yoga exercise program (OTHER); Spinal stabiliz... | \n", "5282164 | \n", "OZ | \n", "Chronic Low-back Pain | \n", "
58805 rows × 9 columns
\n", "\n", " | nct_id | \n", "title | \n", "status | \n", "phase | \n", "conditions | \n", "interventions | \n", "pubchem_id | \n", "drug_name | \n", "standardized_condition | \n", "matched_organ | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "NCT04010565 | \n", "Effect of a Black Garlic Extract on Cholestero... | \n", "COMPLETED | \n", "NaN | \n", "Cardiovascular Diseases | \n", "Aged black garlic extract (DIETARY_SUPPLEMENT)... | \n", "165411828 | \n", "STERITALC | \n", "Cardiovascular Diseases | \n", "None | \n", "
1 | \n", "NCT00565266 | \n", "Asthma Clinical Research Network (ACRN) Trial ... | \n", "COMPLETED | \n", "PHASE3 | \n", "Asthma | \n", "tiotropium bromide (DRUG); salmeterol xinafoat... | \n", "165411828 | \n", "STERITALC | \n", "Asthma | \n", "Lung | \n", "
2 | \n", "NCT04401579 | \n", "Adaptive COVID-19 Treatment Trial 2 (ACTT-2) | \n", "COMPLETED | \n", "PHASE3 | \n", "COVID-19 | \n", "Placebo (OTHER); Remdesivir (DRUG); Baricitini... | \n", "165411828 | \n", "STERITALC | \n", "COVID-19 | \n", "None | \n", "
3 | \n", "NCT05576038 | \n", "Tryptophan for Impaired AhR Signaling in Celia... | \n", "RECRUITING | \n", "NaN | \n", "Tryptophan Metabolism Alterations; Celiac Disease | \n", "L-Tryptophan (DRUG); Freedom SimpleCap Powder ... | \n", "165411828 | \n", "STERITALC | \n", "Tryptophan Metabolism Alterations; Celiac Disease | \n", "None | \n", "
4 | \n", "NCT06744946 | \n", "Thoracoscopic Talc Pleurodesis Versus Bleomyci... | \n", "COMPLETED | \n", "PHASE4 | \n", "Malignant Pleural Effusions (Mpe)- Pleurodesis... | \n", "thoracoscope (PROCEDURE) | \n", "165411828 | \n", "STERITALC | \n", "Malignant Pleural Effusions (Mpe)- Pleurodesis... | \n", "Lung | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
58800 | \n", "NCT03387709 | \n", "UCSD Get Fit, Be Fit Study | \n", "COMPLETED | \n", "NaN | \n", "Overweight and Obesity | \n", "Pistachio-enriched diet (BEHAVIORAL); General ... | \n", "5282164 | \n", "OZ | \n", "Overweight and Obesity | \n", "None | \n", "
58801 | \n", "NCT01084109 | \n", "First Bites: Complementary Feeding - A Global ... | \n", "COMPLETED | \n", "PHASE3 | \n", "Complementary Feeding; Infant Growth | \n", "Lyophilized meat (OTHER); Cereal (OTHER) | \n", "5282164 | \n", "OZ | \n", "Complementary Feeding; Infant Growth | \n", "None | \n", "
58802 | \n", "NCT04987307 | \n", "Safety and Efficacy of Efavaleukin Alfa in Par... | \n", "TERMINATED | \n", "PHASE2 | \n", "Ulcerative Colitis | \n", "Efavaleukin alfa (DRUG); Placebo (DRUG) | \n", "5282164 | \n", "OZ | \n", "Ulcerative Colitis | \n", "Bowel | \n", "
58803 | \n", "NCT03990909 | \n", "Pilot Study of BCAA on Sleep | \n", "RECRUITING | \n", "NaN | \n", "Sleep Disorder; Traumatic Brain Injury | \n", "Branched Chain Amino Acids (DIETARY_SUPPLEMENT... | \n", "5282164 | \n", "OZ | \n", "Sleep Disorder; Traumatic Brain Injury | \n", "CNS/Brain | \n", "
58804 | \n", "NCT04000685 | \n", "The Effects of Different Exercise Approaches i... | \n", "COMPLETED | \n", "NaN | \n", "Chronic Low-back Pain | \n", "Yoga exercise program (OTHER); Spinal stabiliz... | \n", "5282164 | \n", "OZ | \n", "Chronic Low-back Pain | \n", "None | \n", "
58805 rows × 10 columns
\n", "\n", " | nct_id | \n", "title | \n", "status | \n", "phase | \n", "conditions | \n", "interventions | \n", "pubchem_id | \n", "drug_name | \n", "standardized_condition | \n", "matched_organ | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "NCT04010565 | \n", "Effect of a Black Garlic Extract on Cholestero... | \n", "COMPLETED | \n", "NaN | \n", "Cardiovascular Diseases | \n", "Aged black garlic extract (DIETARY_SUPPLEMENT)... | \n", "165411828 | \n", "STERITALC | \n", "Cardiovascular Diseases | \n", "None | \n", "
1 | \n", "NCT00565266 | \n", "Asthma Clinical Research Network (ACRN) Trial ... | \n", "COMPLETED | \n", "PHASE3 | \n", "Asthma | \n", "tiotropium bromide (DRUG); salmeterol xinafoat... | \n", "165411828 | \n", "STERITALC | \n", "Asthma | \n", "Lung | \n", "
2 | \n", "NCT04401579 | \n", "Adaptive COVID-19 Treatment Trial 2 (ACTT-2) | \n", "COMPLETED | \n", "PHASE3 | \n", "COVID-19 | \n", "Placebo (OTHER); Remdesivir (DRUG); Baricitini... | \n", "165411828 | \n", "STERITALC | \n", "COVID-19 | \n", "None | \n", "
3 | \n", "NCT05576038 | \n", "Tryptophan for Impaired AhR Signaling in Celia... | \n", "RECRUITING | \n", "NaN | \n", "Tryptophan Metabolism Alterations; Celiac Disease | \n", "L-Tryptophan (DRUG); Freedom SimpleCap Powder ... | \n", "165411828 | \n", "STERITALC | \n", "Tryptophan Metabolism Alterations; Celiac Disease | \n", "None | \n", "
4 | \n", "NCT06744946 | \n", "Thoracoscopic Talc Pleurodesis Versus Bleomyci... | \n", "COMPLETED | \n", "PHASE4 | \n", "Malignant Pleural Effusions (Mpe)- Pleurodesis... | \n", "thoracoscope (PROCEDURE) | \n", "165411828 | \n", "STERITALC | \n", "Malignant Pleural Effusions (Mpe)- Pleurodesis... | \n", "Lung | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
58800 | \n", "NCT03387709 | \n", "UCSD Get Fit, Be Fit Study | \n", "COMPLETED | \n", "NaN | \n", "Overweight and Obesity | \n", "Pistachio-enriched diet (BEHAVIORAL); General ... | \n", "5282164 | \n", "OZ | \n", "Overweight and Obesity | \n", "None | \n", "
58801 | \n", "NCT01084109 | \n", "First Bites: Complementary Feeding - A Global ... | \n", "COMPLETED | \n", "PHASE3 | \n", "Complementary Feeding; Infant Growth | \n", "Lyophilized meat (OTHER); Cereal (OTHER) | \n", "5282164 | \n", "OZ | \n", "Complementary Feeding; Infant Growth | \n", "None | \n", "
58802 | \n", "NCT04987307 | \n", "Safety and Efficacy of Efavaleukin Alfa in Par... | \n", "TERMINATED | \n", "PHASE2 | \n", "Ulcerative Colitis | \n", "Efavaleukin alfa (DRUG); Placebo (DRUG) | \n", "5282164 | \n", "OZ | \n", "Ulcerative Colitis | \n", "Bowel | \n", "
58803 | \n", "NCT03990909 | \n", "Pilot Study of BCAA on Sleep | \n", "RECRUITING | \n", "NaN | \n", "Sleep Disorder; Traumatic Brain Injury | \n", "Branched Chain Amino Acids (DIETARY_SUPPLEMENT... | \n", "5282164 | \n", "OZ | \n", "Sleep Disorder; Traumatic Brain Injury | \n", "CNS/Brain | \n", "
58804 | \n", "NCT04000685 | \n", "The Effects of Different Exercise Approaches i... | \n", "COMPLETED | \n", "NaN | \n", "Chronic Low-back Pain | \n", "Yoga exercise program (OTHER); Spinal stabiliz... | \n", "5282164 | \n", "OZ | \n", "Chronic Low-back Pain | \n", "None | \n", "
58805 rows × 10 columns
\n", "matched_organ | \n", "Bladder/Urinary Tract | \n", "Bowel | \n", "Breast | \n", "CNS/Brain | \n", "Cervix | \n", "Esophagus/Stomach | \n", "Kidney | \n", "Liver | \n", "Lung | \n", "Ovary/Fallopian Tube | \n", "Pancreas | \n", "Peripheral Nervous System | \n", "Skin | \n", "Uterus | \n", "Total | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pubchem_id | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
5281004 | \n", "0.0 | \n", "6.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "3.0 | \n", "1.0 | \n", "0.0 | \n", "72.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "82.0 | \n", "
16923 | \n", "0.0 | \n", "3.0 | \n", "0.0 | \n", "7.0 | \n", "0.0 | \n", "0.0 | \n", "12.0 | \n", "4.0 | \n", "11.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "39.0 | \n", "
5284373 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "23.0 | \n", "4.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "3.0 | \n", "0.0 | \n", "34.0 | \n", "
107969 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "31.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "31.0 | \n", "
24906252 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "24.0 | \n", "3.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "29.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
11595577 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
11957481 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
72172 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
16052011 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
5282164 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
544 rows × 15 columns
\n", "