{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "7c6c914c", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from scipy.stats import spearmanr\n", "import seaborn as sns\n", "from sklearn.linear_model import Ridge\n", "from sklearn.model_selection import train_test_split\n", "import torch\n", "from tqdm.auto import tqdm\n", "from transformers import AutoModelForCausalLM, AutoTokenizer" ] }, { "cell_type": "code", "execution_count": null, "id": "00cfd012", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | antibody_id | \n", "antibody_name | \n", "Titer | \n", "Purity | \n", "SEC %Monomer | \n", "SMAC | \n", "HIC | \n", "HAC | \n", "PR_CHO | \n", "PR_Ova | \n", "... | \n", "hc_protein_sequence | \n", "hc_dna_sequence | \n", "vl_protein_sequence | \n", "lc_protein_sequence | \n", "lc_dna_sequence | \n", "hierarchical_cluster_fold | \n", "random_fold | \n", "hierarchical_cluster_IgG_isotype_stratified_fold | \n", "light_aligned_aho | \n", "heavy_aligned_aho | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "GDPa1-001 | \n", "abagovomab | \n", "140.25 | \n", "98.530 | \n", "97.010 | \n", "2.730 | \n", "2.590 | \n", "NaN | \n", "0.337837 | \n", "0.263108 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVKLQESGAELARPGASVKLSCKASGYTF... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL... | \n", "MRAWIFFLLCLAGRALADIELTQSPASLSASVGETVTITCQASENI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "1 | \n", "2 | \n", "2 | \n", "DIELTQSPASLSASVGETVTITCQAS--ENIY------SYLAWHQQ... | \n", "QVKLQES-GAELARPGASVKLSCKASG-YTFTN-----YWMQWVKQ... | \n", "
1 | \n", "GDPa1-002 | \n", "abituzumab | \n", "193.31 | \n", "99.825 | \n", "97.620 | \n", "2.745 | \n", "2.545 | \n", "3.690 | \n", "0.205246 | \n", "0.100155 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVQLQQSGGELAKPGASVKVSCKASGYTF... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQDI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "1 | \n", "4 | \n", "0 | \n", "DIQMTQSPSSLSASVGDRVTITCRAS--QDIS------NYLAWYQQ... | \n", "QVQLQQS-GGELAKPGASVKVSCKASG-YTFSS-----FWMHWVRQ... | \n", "
2 | \n", "GDPa1-003 | \n", "abrezekimab | \n", "114.75 | \n", "98.350 | \n", "89.055 | \n", "2.740 | \n", "2.705 | \n", "NaN | \n", "0.138773 | \n", "0.101180 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVTLKESGPVLVKPTETLTLTCTVSGFSL... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSLSASVGDRVTITCLASEDISNYLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCLASEDI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "2 | \n", "2 | \n", "2 | \n", "DIQMTQSPSSLSASVGDRVTITCLAS--EDIS------NYLAWYQQ... | \n", "QVTLKES-GPVLVKPTETLTLTCTVSG-FSLTN-----YHVQWIRQ... | \n", "
3 | \n", "GDPa1-004 | \n", "abrilumab | \n", "327.32 | \n", "98.575 | \n", "98.605 | \n", "2.715 | \n", "2.565 | \n", "1.005 | \n", "0.000000 | \n", "0.054971 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVQLVQSGAEVKKPGASVKVSCKVSGYTL... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSVSASVGDRVTITCRASQGI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "1 | \n", "3 | \n", "0 | \n", "DIQMTQSPSSVSASVGDRVTITCRAS--QGIS------SWLAWYQQ... | \n", "QVQLVQS-GAEVKKPGASVKVSCKVSG-YTLSD-----LSIHWVRQ... | \n", "
4 | \n", "GDPa1-005 | \n", "adalimumab | \n", "313.39 | \n", "99.300 | \n", "96.120 | \n", "2.705 | \n", "2.495 | \n", "NaN | \n", "0.183387 | \n", "0.085628 | \n", "... | \n", "MRAWIFFLLCLAGRALAEVQLVESGGGLVQPGRSLRLSCAASGFTF... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQGI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "0 | \n", "2 | \n", "0 | \n", "DIQMTQSPSSLSASVGDRVTITCRAS--QGIR------NYLAWYQQ... | \n", "EVQLVES-GGGLVQPGRSLRLSCAASG-FTFDD-----YAMHWVRQ... | \n", "
5 rows × 30 columns
\n", "