{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "7c6c914c", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from scipy.stats import spearmanr\n", "import seaborn as sns\n", "from sklearn.linear_model import Ridge\n", "from sklearn.model_selection import train_test_split\n", "import torch\n", "from tqdm.auto import tqdm\n", "from transformers import AutoModelForCausalLM, AutoTokenizer" ] }, { "cell_type": "code", "execution_count": null, "id": "00cfd012", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
antibody_idantibody_nameTiterPuritySEC %MonomerSMACHICHACPR_CHOPR_Ova...hc_protein_sequencehc_dna_sequencevl_protein_sequencelc_protein_sequencelc_dna_sequencehierarchical_cluster_foldrandom_foldhierarchical_cluster_IgG_isotype_stratified_foldlight_aligned_ahoheavy_aligned_aho
0GDPa1-001abagovomab140.2598.53097.0102.7302.590NaN0.3378370.263108...MRAWIFFLLCLAGRALAQVKLQESGAELARPGASVKLSCKASGYTF...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...MRAWIFFLLCLAGRALADIELTQSPASLSASVGETVTITCQASENI...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...122DIELTQSPASLSASVGETVTITCQAS--ENIY------SYLAWHQQ...QVKLQES-GAELARPGASVKLSCKASG-YTFTN-----YWMQWVKQ...
1GDPa1-002abituzumab193.3199.82597.6202.7452.5453.6900.2052460.100155...MRAWIFFLLCLAGRALAQVQLQQSGGELAKPGASVKVSCKASGYTF...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQDI...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...140DIQMTQSPSSLSASVGDRVTITCRAS--QDIS------NYLAWYQQ...QVQLQQS-GGELAKPGASVKVSCKASG-YTFSS-----FWMHWVRQ...
2GDPa1-003abrezekimab114.7598.35089.0552.7402.705NaN0.1387730.101180...MRAWIFFLLCLAGRALAQVTLKESGPVLVKPTETLTLTCTVSGFSL...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...DIQMTQSPSSLSASVGDRVTITCLASEDISNYLAWYQQKPGKAPKL...MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCLASEDI...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...222DIQMTQSPSSLSASVGDRVTITCLAS--EDIS------NYLAWYQQ...QVTLKES-GPVLVKPTETLTLTCTVSG-FSLTN-----YHVQWIRQ...
3GDPa1-004abrilumab327.3298.57598.6052.7152.5651.0050.0000000.054971...MRAWIFFLLCLAGRALAQVQLVQSGAEVKKPGASVKVSCKVSGYTL...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...MRAWIFFLLCLAGRALADIQMTQSPSSVSASVGDRVTITCRASQGI...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...130DIQMTQSPSSVSASVGDRVTITCRAS--QGIS------SWLAWYQQ...QVQLVQS-GAEVKKPGASVKVSCKVSG-YTLSD-----LSIHWVRQ...
4GDPa1-005adalimumab313.3999.30096.1202.7052.495NaN0.1833870.085628...MRAWIFFLLCLAGRALAEVQLVESGGGLVQPGRSLRLSCAASGFTF...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQGI...GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG...020DIQMTQSPSSLSASVGDRVTITCRAS--QGIR------NYLAWYQQ...EVQLVES-GGGLVQPGRSLRLSCAASG-FTFDD-----YAMHWVRQ...
\n", "

5 rows × 30 columns

\n", "
" ], "text/plain": [ " antibody_id antibody_name Titer Purity SEC %Monomer SMAC HIC \\\n", "0 GDPa1-001 abagovomab 140.25 98.530 97.010 2.730 2.590 \n", "1 GDPa1-002 abituzumab 193.31 99.825 97.620 2.745 2.545 \n", "2 GDPa1-003 abrezekimab 114.75 98.350 89.055 2.740 2.705 \n", "3 GDPa1-004 abrilumab 327.32 98.575 98.605 2.715 2.565 \n", "4 GDPa1-005 adalimumab 313.39 99.300 96.120 2.705 2.495 \n", "\n", " HAC PR_CHO PR_Ova ... \\\n", "0 NaN 0.337837 0.263108 ... \n", "1 3.690 0.205246 0.100155 ... \n", "2 NaN 0.138773 0.101180 ... \n", "3 1.005 0.000000 0.054971 ... \n", "4 NaN 0.183387 0.085628 ... \n", "\n", " hc_protein_sequence \\\n", "0 MRAWIFFLLCLAGRALAQVKLQESGAELARPGASVKLSCKASGYTF... \n", "1 MRAWIFFLLCLAGRALAQVQLQQSGGELAKPGASVKVSCKASGYTF... \n", "2 MRAWIFFLLCLAGRALAQVTLKESGPVLVKPTETLTLTCTVSGFSL... \n", "3 MRAWIFFLLCLAGRALAQVQLVQSGAEVKKPGASVKVSCKVSGYTL... \n", "4 MRAWIFFLLCLAGRALAEVQLVESGGGLVQPGRSLRLSCAASGFTF... \n", "\n", " hc_dna_sequence \\\n", "0 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "1 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "2 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "3 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "4 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "\n", " vl_protein_sequence \\\n", "0 DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL... \n", "1 DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL... \n", "2 DIQMTQSPSSLSASVGDRVTITCLASEDISNYLAWYQQKPGKAPKL... \n", "3 DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL... \n", "4 DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL... \n", "\n", " lc_protein_sequence \\\n", "0 MRAWIFFLLCLAGRALADIELTQSPASLSASVGETVTITCQASENI... \n", "1 MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQDI... \n", "2 MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCLASEDI... \n", "3 MRAWIFFLLCLAGRALADIQMTQSPSSVSASVGDRVTITCRASQGI... \n", "4 MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQGI... \n", "\n", " lc_dna_sequence \\\n", "0 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "1 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "2 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "3 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "4 GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... \n", "\n", " hierarchical_cluster_fold random_fold \\\n", "0 1 2 \n", "1 1 4 \n", "2 2 2 \n", "3 1 3 \n", "4 0 2 \n", "\n", " hierarchical_cluster_IgG_isotype_stratified_fold \\\n", "0 2 \n", "1 0 \n", "2 2 \n", "3 0 \n", "4 0 \n", "\n", " light_aligned_aho \\\n", "0 DIELTQSPASLSASVGETVTITCQAS--ENIY------SYLAWHQQ... \n", "1 DIQMTQSPSSLSASVGDRVTITCRAS--QDIS------NYLAWYQQ... \n", "2 DIQMTQSPSSLSASVGDRVTITCLAS--EDIS------NYLAWYQQ... \n", "3 DIQMTQSPSSVSASVGDRVTITCRAS--QGIS------SWLAWYQQ... \n", "4 DIQMTQSPSSLSASVGDRVTITCRAS--QGIR------NYLAWYQQ... \n", "\n", " heavy_aligned_aho \n", "0 QVKLQES-GAELARPGASVKLSCKASG-YTFTN-----YWMQWVKQ... \n", "1 QVQLQQS-GGELAKPGASVKVSCKASG-YTFSS-----FWMHWVRQ... \n", "2 QVTLKES-GPVLVKPTETLTLTCTVSG-FSLTN-----YHVQWIRQ... \n", "3 QVQLVQS-GAEVKKPGASVKVSCKVSG-YTLSD-----LSIHWVRQ... \n", "4 EVQLVES-GGGLVQPGRSLRLSCAASG-FTFDD-----YAMHWVRQ... \n", "\n", "[5 rows x 30 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_name = \"ollieturnbull/p-IgGen\"\n", "df = load_dataset(\"ginkgo-datapoints/GDPa1\")[\"train\"].to_pandas()\n", "\n", "# Example: Just predict HIC, so we'll drop NaN rows for that\n", "df = df.dropna(subset=[\"HIC\"])\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "f6da015f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1Q V K L Q E S G A E L A R P G A S V K L S C K A S G Y T F T N Y W M Q W V K Q R P G Q G L D W I G A I Y P G D G N T R Y T H K F K G K A T L T A D K S S S T A Y M Q L S S L A S E D S G V Y Y C A R G E G N Y A W F A Y W G Q G T T V T V S SD I E L T Q S P A S L S A S V G E T V T I T C Q A S E N I Y S Y L A W H Q Q K Q G K S P Q L L V Y N A K T L A G G V S S R F S G S G S G T H F S L K I K S L Q P E D F G I Y Y C Q H H Y G I L P T F G G G T K L E I K2\n" ] } ], "source": [ "# Tokenize the sequences\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "# Paired sequence handling: Concatenate heavy and light chains and add beginning (\"1\") and end (\"2\") tokens \n", "# (e.g. [\"EVQLV...\", \"DIQMT...\"] -> \"1E V Q L V ... D I Q M T ... 2\")\n", "sequences = [\n", " \"1\" + \" \".join(heavy) + \" \".join(light) + \"2\"\n", " for heavy, light in zip(\n", " df[\"vh_protein_sequence\"],\n", " df[\"vl_protein_sequence\"],\n", " )\n", "]\n", "\n", "print(sequences[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "afeb8db8", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "48c1bb6d281f476abd0156e2cf5ef1e4", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/8 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.scatterplot(x=y_test[:, 0], y=y_pred[:, 0])\n", "plt.title(f\"Scatter plot of predicted vs. true Hydrophobicity\\nSpearman's rho: {spearmanr(y_pred, y_test)[0]:.2f}\")\n", "plt.xlabel(\"True Hydrophobicity\")\n", "plt.ylabel(\"Predicted Hydrophobicity\")\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "6f346b98", "metadata": {}, "source": [ "## Cross-validation" ] }, { "cell_type": "code", "execution_count": null, "id": "6f395093", "metadata": {}, "outputs": [], "source": [ "# TODO same as above but using hierarchical_cluster_IgG_isotype_stratified_fold" ] } ], "metadata": { "kernelspec": { "display_name": "mlbase", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }