|
import ast |
|
import pandas as pd |
|
from datasets import load_dataset |
|
|
|
|
|
# Load the E-distance matrix. The code below relies on a `cell_line` column;
# every other column is treated as one drug condition.
edist = pd.read_table("Tahoe96M_edist_matrix_within_plate.tsv", sep="\t")

# Reshape wide -> long: one row per (cell_line, drug column) pair, with the
# original column header kept in `drug_info` and the cell value in `e_distance`.
edist_long = edist.melt(id_vars=["cell_line"], var_name="drug_info", value_name="e_distance")

# Each drug column header is a Python-literal string whose first element is a
# (drug_name, concentration, unit) triple. After the melt each header is
# repeated once per input row, so parse every distinct header a single time
# and look the result up, rather than calling ast.literal_eval on every row.
_parsed_headers = {
    header: ast.literal_eval(header)[0]
    for header in edist_long["drug_info"].unique()
}
edist_long["drug_info"] = edist_long["drug_info"].map(_parsed_headers)

# Expand the parsed triples into three named columns, aligned on the existing
# index so the assignment matches row for row.
edist_long[["drug_name", "concentration", "unit"]] = pd.DataFrame(
    edist_long["drug_info"].tolist(), index=edist_long.index
)
|
|
|
|
|
|
|
# Drug annotations: load the "drug_metadata" configuration of
# tahoebio/Tahoe-100M and convert its "train" split to a DataFrame.
ds = load_dataset(path="tahoebio/Tahoe-100M", name="drug_metadata")
drug_metadata = ds["train"].to_pandas()
# Not referenced again in the visible part of the script.
drug_name = drug_metadata["drug"]

# Cell-line annotations, loaded the same way from "cell_line_metadata".
cs = load_dataset(path="tahoebio/Tahoe-100M", name="cell_line_metadata")
cell_line_metadata = cs["train"].to_pandas()
# Not referenced again in the visible part of the script.
cell_line = cell_line_metadata["Cell_ID_Cellosaur"]
|
|
|
|
|
|
|
# Attach drug annotations to every long-format row. The inner join keeps only
# rows whose `drug_name` has a matching `drug` entry in the metadata table.
edist_long_drug_metadata = pd.merge(
    edist_long,
    drug_metadata,
    how="inner",
    left_on="drug_name",
    right_on="drug",
)
# Bare expression: shows the shape in an interactive session, has no effect
# when the file runs as a script.
edist_long_drug_metadata.shape

# Keep one row per (drug_name, cell_line): order rows from largest to smallest
# e_distance, then retain the first occurrence of each pair.
_by_distance_desc = edist_long_drug_metadata.sort_values(by="e_distance", ascending=False)
_one_row_per_pair = _by_distance_desc.drop_duplicates(subset=["drug_name", "cell_line"], keep="first")
top_edist_per_drug_cell_line_combo = _one_row_per_pair.reset_index(drop=True)

# Write the result without the positional index column.
top_edist_per_drug_cell_line_combo.to_csv("tahoe_largest_edist_per_drug_cell_line_combo.csv", index=False)
|
|