Spaces:

jinysun
/

BiBERTa

Sleeping

App Files Files Community

jinysun committed on Nov 23, 2023

Commit

ecdea35

1 Parent(s): dabfe71

Upload 17 files

Browse files

Files changed (18) hide show

.gitattributes +2 -0
app.py +27 -16
config/config_hparam.json +26 -0
config/predict.json +26 -0
util/__pycache__/attention_flow.cpython-38.pyc +0 -0
util/__pycache__/emetric.cpython-38.pyc +0 -0
util/__pycache__/regression_metric.cpython-38.pyc +0 -0
util/__pycache__/stream.cpython-38.pyc +0 -0
util/__pycache__/utils.cpython-38.pyc +0 -0
util/attention_flow.py +195 -0
util/attention_plot.py +93 -0
util/boxplot.py +201 -0
util/data/bindingdb_kd.tab +3 -0
util/data/davis.tab +3 -0
util/emetric.py +59 -0
util/load_dataset.py +32 -0
util/make_external_validation.py +28 -0
util/utils.py +45 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+util/data/bindingdb_kd.tab filter=lfs diff=lfs merge=lfs -text
+util/data/davis.tab filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -13,12 +13,16 @@ st.title("🔋DeepDAP")
 url1= r"https://docs.google.com/spreadsheets/d/1AKkZS04VF3osFT36aNHIb4iUbV8D1uNfsldcpHXogj0/gviz/tq?tqx=out:csv&sheet=dap"
 df1 = pd.read_csv(url1, dtype=str, encoding='utf-8')
-text_search = st.text_input("🔍Search papers or molecules", value="")
-m1 = df1["Donor_Name"].str.contains(text_search)
-m2 = df1["reference"].str.contains(text_search)
-m3 = df1["Acceptor_Name"].str.contains(text_search)
-df_search = df1[m1 | m2|m3]
 if text_search:
     st.write(df_search)
     st.download_button( "⬇️Download edited files as .csv", df_search.to_csv(), "df_search.csv", use_container_width=True)
@@ -28,16 +32,23 @@ st.download_button(
     "⬇️ Download edited files as .csv", edited_df.to_csv(), "edited_df.csv", use_container_width=True
 )
-molecule = st.text_input("👨‍🔬Molecule")
-smile_code = st_ketcher(molecule)
-st.markdown("🏆New SMILES of edited molecules: {smile_code }")
-acceptor=  st.text_input("🎈SMILES  of acceptor")
-donor =  st.text_input("🎈SMILES  of donor")
 try:
     pce = run.smiles_aas_test( str(acceptor ), str(donor) )
-    st.markdown("⚡PCE: ``{pce}``")
 except:
-    st.markdown("⚡PCE:  None  ")

 url1= r"https://docs.google.com/spreadsheets/d/1AKkZS04VF3osFT36aNHIb4iUbV8D1uNfsldcpHXogj0/gviz/tq?tqx=out:csv&sheet=dap"
 df1 = pd.read_csv(url1, dtype=str, encoding='utf-8')
+col1, col2 = st.columns(2)
+with col1:
+	text_search = st.text_input("🔍Search papers or molecules", value="")
+	# Match the query literally (regex=False) so names or SMILES containing
+	# characters such as "(", "[" or "+" do not raise a regex error, and treat
+	# missing cells as non-matches (na=False) so the combined mask is boolean.
+	m1 = df1["Donor_Name"].str.contains(text_search, regex=False, na=False)
+	m2 = df1["reference"].str.contains(text_search, regex=False, na=False)
+	m3 = df1["Acceptor_Name"].str.contains(text_search, regex=False, na=False)
+	# Keep every row where at least one of the three columns matches.
+	df_search = df1[m1 | m2 | m3]
+with col2:
+	st.link_button("📝Database",  r"https://docs.google.com/spreadsheets/d/1AKkZS04VF3osFT36aNHIb4iUbV8D1uNfsldcpHXogj0")
+	st.caption('🎉If you want to update the database, click the button.')
 if text_search:
     st.write(df_search)
     st.download_button( "⬇️Download edited files as .csv", df_search.to_csv(), "df_search.csv", use_container_width=True)
     "⬇️ Download edited files as .csv", edited_df.to_csv(), "edited_df.csv", use_container_width=True
 )
+# Choose which of the two molecules is drawn in the Ketcher editor; the other
+# one is entered as plain text.
+option = st.selectbox(
+   "Which molecule would you like to draw?",
+   ("Donor", "Acceptor"), placeholder="Select the type of active layer..."
+)
+if option == 'Acceptor':
+	molecule = st.text_input("👨‍🔬Acceptor Molecule" )
+	acceptor= st_ketcher(molecule )
+	st.markdown(f"🏆New SMILES of edited acceptor molecules: {acceptor}")
+	donor= st.text_input("📋 Donor Molecule")
+elif option =='Donor':
+	do= st.text_input("👨‍🔬Donor Molecule" )
+	donor = st_ketcher(do)
+	st.markdown(f"🏆New SMILES of edited donor molecules: {donor}")
+	acceptor = st.text_input("📋 Acceptor Molecule")
 try:
     pce = run.smiles_aas_test( str(acceptor ), str(donor) )
+    st.markdown(f"⚡PCE: ``{pce}``")
 except:
+    st.markdown(f"⚡PCE:  None  ")

config/config_hparam.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{	"name": "biomarker_log",
+	"d_model_name" : "DeepChem/ChemBERTa-10M-MTR",
+	"p_model_name" : "DeepChem/ChemBERTa-77M-MLM",
+	"gpu_ids" : "0",
+	"model_mode" : "train",
+	"load_checkpoint" : "./checkpoint/bindingDB/test.ckpt",
+	"prot_maxlength" : 360,
+	"layer_limit" : true,
+	"max_epoch": 16,
+	"batch_size": 40,
+	"num_workers": 0,
+	"task_name" : "OSC",
+	"lr": 1e-4,
+	"layer_features" : [512, 128, 64, 1],
+	"dropout" : 0.1,
+	"loss_fn" : "MSE",
+	"traindata_rate" : 1.0,
+	"pretrained": {"chem":true, "prot":true},
+	"num_seed" : 111
+}

config/predict.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{	"name": "biomarker_log",
+	"d_model_name" : "DeepChem/ChemBERTa-10M-MTR",
+	"p_model_name" : "DeepChem/ChemBERTa-77M-MLM",
+	"gpu_ids" : "0",
+	"model_mode" : "test",
+	"load_checkpoint" : "./OSC/test.ckpt",
+	"prot_maxlength" : 360,
+	"layer_limit" : true,
+	"max_epoch": 16,
+	"batch_size": 40,
+	"num_workers": 0,
+	"task_name" : "OSC",
+	"lr": 1e-4,
+	"layer_features" : [512, 128, 64, 1],
+	"dropout" : 0.1,
+	"loss_fn" : "MSE",
+	"traindata_rate" : 1.0,
+	"pretrained": {"chem":true, "prot":true},
+	"num_seed" : 111
+}

util/__pycache__/attention_flow.cpython-38.pyc ADDED Viewed

Binary file (6.07 kB). View file

util/__pycache__/emetric.cpython-38.pyc ADDED Viewed

Binary file (1.87 kB). View file

util/__pycache__/regression_metric.cpython-38.pyc ADDED Viewed

Binary file (1.88 kB). View file

util/__pycache__/stream.cpython-38.pyc ADDED Viewed

Binary file (2.96 kB). View file

util/__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (1.6 kB). View file

util/attention_flow.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import networkx as nx
+import numpy as np
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import seaborn as sns
+import itertools
+import matplotlib as mpl
+# import cugraph as cnx
+rc={'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10.0,
+    'axes.titlesize': 32, 'xtick.labelsize': 20, 'ytick.labelsize': 16}
+plt.rcParams.update(**rc)
+mpl.rcParams['axes.linewidth'] = .5 #set the value globally
+def plot_attention_heatmap(att, s_position, t_positions, input_tokens):
+    """Draw a seaborn heatmap of att[:, s_position, t_positions] and return its axes.
+    The first axis of ``att`` is reversed before plotting, and the y tick labels
+    count down from att.shape[0] to 1 with only even numbers shown. X tick labels
+    are the entries of ``input_tokens`` whose index is in ``t_positions``.
+    """
+    selected = att[:, s_position, t_positions]
+    cls_att = np.flip(selected, axis=0)
+    keep_mask = [idx in t_positions for idx in np.arange(len(input_tokens))]
+    xticklb = list(itertools.compress(input_tokens, keep_mask))
+    yticklb = []
+    for layer_no in np.arange(att.shape[0], 0, -1):
+        yticklb.append(str(layer_no) if layer_no % 2 == 0 else '')
+    return sns.heatmap(cls_att, xticklabels=xticklb, yticklabels=yticklb, cmap="YlOrRd")
+def convert_adjmat_tomats(adjmat, n_layers, l):
+    """Cut the block sub-diagonal of ``adjmat`` into an (n_layers, l, l) array.
+    Block ``k`` is rows (k+1)*l..(k+2)*l and columns k*l..(k+1)*l of ``adjmat``.
+    """
+    mats = np.zeros((n_layers, l, l))
+    for layer in np.arange(n_layers):
+        row_span = slice((layer + 1) * l, (layer + 2) * l)
+        col_span = slice(layer * l, (layer + 1) * l)
+        mats[layer] = adjmat[row_span, col_span]
+    return mats
+def make_residual_attention(attentions):
+    """Return (attentions_mat, res_att_mat) built from an iterable of attention tensors.
+    ``attentions_mat`` stacks every tensor as a NumPy array and keeps index 0 of the
+    second axis (presumably the batch dimension - confirm with the caller).
+    ``res_att_mat`` is the mean over axis 1 of that array, plus an identity matrix,
+    with every row rescaled to sum to 1.
+    """
+    stacked = np.asarray([layer_att.detach().cpu().numpy() for layer_att in attentions])
+    attentions_mat = stacked[:, 0]
+    # Sum divided by the axis length, i.e. the mean over axis 1.
+    res_att_mat = attentions_mat.sum(axis=1) / attentions_mat.shape[1]
+    identity = np.eye(res_att_mat.shape[1])
+    res_att_mat = res_att_mat + identity[None, ...]
+    row_totals = res_att_mat.sum(axis=-1)
+    res_att_mat = res_att_mat / row_totals[..., None]
+    return attentions_mat, res_att_mat
+## -------------------------------------------------------- ##
+## -- Make flow network (No Print Node - edge Connection)-- ##
+## -------------------------------------------------------- ##
+def make_flow_network(mat, input_tokens):
+    """Build the layered attention graph used for max-flow computations.
+    Parameters
+    ----------
+    mat : array of shape (n_layers, length, length)
+        mat[i-1][k_f][k_t] becomes the weight of the edge from node k_f of
+        layer i to node k_t of layer i-1.
+    input_tokens : sequence of str
+        One token per position; used to label the layer-0 nodes "<k>_<token>".
+    Returns
+    -------
+    (net_graph, labels_to_index)
+        A networkx DiGraph whose edges carry 'weight' and an equal 'capacity',
+        and a dict mapping node labels ("<k>_<token>" for inputs, "L<i>_<k>"
+        for layer i) to node indices.
+    """
+    n_layers, length, _ = mat.shape
+    n_nodes = (n_layers + 1) * length
+    adj_mat = np.zeros((n_nodes, n_nodes))
+    labels_to_index = {}
+    for k in np.arange(length):
+        labels_to_index[str(k) + "_" + input_tokens[k]] = k
+    for i in np.arange(1, n_layers + 1):
+        for k_f in np.arange(length):
+            labels_to_index["L" + str(i) + "_" + str(k_f)] = i * length + k_f
+        # Rows of layer i point at the columns of layer i-1.
+        adj_mat[i * length:(i + 1) * length, (i - 1) * length:i * length] = mat[i - 1]
+    # from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array builds the
+    # same weighted digraph (one edge per non-zero entry).
+    net_graph = nx.from_numpy_array(adj_mat, create_using=nx.DiGraph)
+    # Flow algorithms read 'capacity'; mirror the weight onto it in a single
+    # pass over the existing edges instead of probing every (i, j) cell.
+    for _, _, edge_data in net_graph.edges(data=True):
+        edge_data['capacity'] = edge_data['weight']
+    return net_graph, labels_to_index
+def make_input_node(attention_mat, res_labels_to_index):
+    """Return the labels whose node index is below attention_mat.shape[-1].
+    Labels keep the iteration order of ``res_labels_to_index``.
+    """
+    n_positions = attention_mat.shape[-1]
+    return [label for label, node_index in res_labels_to_index.items()
+            if node_index < n_positions]
+## ------------------------------------------------ ##
+## -- Draw Attention flow node - Edge Connection -- ##
+## ------------------------------------------------ ##
+##-- networkx graph Initialization and Calculation flow --##
+def get_adjmat(mat, input_tokens):
+    """Return (adj_mat, labels_to_index) for the layered attention graph.
+    ``adj_mat`` is square with (n_layers + 1) * length rows; the block for
+    layer i (rows) against layer i-1 (columns) is filled with mat[i-1].
+    ``labels_to_index`` maps "<k>_<token>" to k for the input positions and
+    "L<i>_<k>" to i * length + k for every later layer.
+    """
+    n_layers, length, _ = mat.shape
+    side = (n_layers + 1) * length
+    adj_mat = np.zeros((side, side))
+    labels_to_index = {}
+    for position in np.arange(length):
+        labels_to_index[str(position) + "_" + input_tokens[position]] = position
+    for layer in np.arange(1, n_layers + 1):
+        for position in np.arange(length):
+            labels_to_index["L" + str(layer) + "_" + str(position)] = layer * length + position
+        # Whole-block copy: same cells the element-wise loops would fill.
+        adj_mat[layer * length:(layer + 1) * length,
+                (layer - 1) * length:layer * length] = mat[layer - 1]
+    return adj_mat, labels_to_index
+def draw_attention_graph(adjmat, labels_to_index, n_layers, length):
+    """Draw the layered attention graph with matplotlib and return it.
+    Parameters
+    ----------
+    adjmat : square array
+        Weighted adjacency matrix; non-zero entries become directed edges.
+    labels_to_index : dict
+        Node label -> node index; the text after the last "_" of each label is
+        drawn next to nodes whose index is below ``length``.
+    n_layers, length : int
+        Nodes are laid out on an (n_layers + 1) x length grid.
+    Returns
+    -------
+    networkx.DiGraph with 'weight' and 'capacity' set on every edge.
+    """
+    # from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
+    # replacement and yields the same weighted digraph.
+    net_graph = nx.from_numpy_array(adjmat, create_using=nx.DiGraph)
+    for _, _, edge_data in net_graph.edges(data=True):
+        edge_data['capacity'] = edge_data['weight']
+    pos = {}
+    label_pos = {}
+    for i in np.arange(n_layers + 1):
+        for k_f in np.arange(length):
+            # Labels sit slightly left of the node they describe.
+            pos[i * length + k_f] = ((i + 0.4) * 2, length - k_f)
+            label_pos[i * length + k_f] = (i * 2, length - k_f)
+    # Only nodes with index < length get text; all others are labelled ''.
+    index_to_labels = {}
+    for key, node_index in labels_to_index.items():
+        index_to_labels[node_index] = key.split("_")[-1] if node_index < length else ''
+    #plt.figure(1,figsize=(20,12))
+    # draw_networkx_nodes has no `labels` argument; labels are drawn separately.
+    nx.draw_networkx_nodes(net_graph, pos, node_color='green', node_size=50)
+    nx.draw_networkx_labels(net_graph, pos=label_pos, labels=index_to_labels, font_size=18)
+    # Draw all edges in one call, using each edge's weight as its line width,
+    # instead of re-scanning the edge list once per distinct weight.
+    edge_list = list(net_graph.edges())
+    if edge_list:
+        edge_widths = [net_graph[u][v]['weight'] for u, v in edge_list]
+        nx.draw_networkx_edges(net_graph, pos, edgelist=edge_list, width=edge_widths, edge_color='darkblue')
+    return net_graph
+def compute_flows(G, labels_to_index, input_nodes, length):
+    """Compute normalised max-flow values from every non-input node to every input node.
+    Parameters
+    ----------
+    G : networkx graph with 'capacity' on its edges
+    labels_to_index : dict mapping node label -> node index
+    input_nodes : iterable of labels (keys of ``labels_to_index``) treated as sinks
+    length : int, number of nodes per layer
+    Returns
+    -------
+    (N, N) array with N = len(labels_to_index). Row u holds the flows from node
+    u, stored at column (layer(u) - 1) * length + v for input node v, scaled
+    so that the row sums to 1. Rows with no flow at all stay zero.
+    """
+    number_of_nodes = len(labels_to_index)
+    flow_values = np.zeros((number_of_nodes, number_of_nodes))
+    input_node_set = set(input_nodes)  # O(1) membership inside the loop
+    for key in tqdm(labels_to_index, desc="flow algorithms", total=len(labels_to_index)):
+        if key in input_node_set:
+            continue
+        u = labels_to_index[key]
+        pre_layer = u // length - 1
+        for inp_node_key in input_nodes:
+            v = labels_to_index[inp_node_key]
+            flow_value = nx.maximum_flow_value(G, u, v, flow_func=nx.algorithms.flow.edmonds_karp)
+            flow_values[u][pre_layer * length + v] = flow_value
+        # Guard against 0/0, which would turn the whole row into NaN.
+        row_total = flow_values[u].sum()
+        if row_total > 0:
+            flow_values[u] /= row_total
+    return flow_values
+def compute_node_flow(G, labels_to_index, input_nodes, output_nodes,length):
+    """Compute normalised max-flow values from the given output nodes to every input node.
+    Same computation as compute_flows, restricted to the labels in
+    ``output_nodes`` (labels that are also input nodes are skipped).
+    Returns an (N, N) array with N = len(labels_to_index); row u is scaled to
+    sum to 1, and rows with no flow at all stay zero.
+    """
+    number_of_nodes = len(labels_to_index)
+    flow_values = np.zeros((number_of_nodes, number_of_nodes))
+    input_node_set = set(input_nodes)  # O(1) membership inside the loop
+    for key in output_nodes:
+        if key in input_node_set:
+            continue
+        u = labels_to_index[key]
+        pre_layer = u // length - 1
+        for inp_node_key in input_nodes:
+            v = labels_to_index[inp_node_key]
+            flow_value = nx.maximum_flow_value(G, u, v, flow_func=nx.algorithms.flow.edmonds_karp)
+            flow_values[u][pre_layer * length + v] = flow_value
+        # Guard against 0/0, which would turn the whole row into NaN.
+        row_total = flow_values[u].sum()
+        if row_total > 0:
+            flow_values[u] /= row_total
+    return flow_values
+def compute_joint_attention(att_mat, add_residual=True):
+    """Chain per-layer attention matrices by repeated matrix product.
+    With ``add_residual`` an identity matrix is added to every layer and each
+    row is rescaled to sum to 1 first. Entry 0 of the result is the (augmented)
+    first layer; entry i is layer i multiplied by entry i-1.
+    """
+    aug_att_mat = att_mat
+    if add_residual:
+        identity = np.eye(att_mat.shape[1])
+        aug_att_mat = att_mat + identity[None, ...]
+        aug_att_mat = aug_att_mat / aug_att_mat.sum(axis=-1)[..., None]
+    joint_attentions = np.zeros(aug_att_mat.shape)
+    joint_attentions[0] = aug_att_mat[0]
+    for layer, layer_att in enumerate(aug_att_mat[1:], start=1):
+        joint_attentions[layer] = layer_att.dot(joint_attentions[layer - 1])
+    return joint_attentions

util/attention_plot.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+def make_attention_table(att, tokens, numb, token_idx = 0, layerNumb = -1):
+    """Export and plot the attention that one token pays to tokens[1:].
+    Reads att[layerNumb, token_idx, 1..len(tokens)-1], writes it to
+    ``amino_acid_seq_attention_{numb}.csv`` in the working directory, renders a
+    bar chart with the three largest values coloured crimson, saves it as
+    ``figures/Amino_acid_seq_{numb}.png`` and opens the figure.
+    NOTE(review): assumes ``att`` is indexed as (layer, source token, target
+    token) and that the ``figures`` directory already exists - confirm.
+    """
+    # The first token is skipped both here and in tokens[1:] below.
+    token_att = att[layerNumb, token_idx, range(1, len(tokens))]
+    token_label=[]
+    token_numb=[]
+    for idx, token in enumerate(tokens[1:]) :
+        # Bold token text is used as tick text; the zero-based position is the x value.
+        token_label.append(f"<b>{token}</b>")
+        token_numb.append(f"{idx}")
+    pair = list(zip(token_numb, token_att))
+    df = pd.DataFrame(pair, columns=["Amino acid", "Attention rate"])
+    df.to_csv(f"amino_acid_seq_attention_{numb}.csv", index=None)
+    # Positions of the three highest attention values.
+    top3_idx = sorted(range(len(token_att)), key=lambda i: token_att[i], reverse=True)[:3]
+    colors = ['cornflowerblue', ] * len(token_numb)
+    for i in top3_idx:
+       colors[i] = 'crimson'
+    fig = go.Figure(data=[go.Bar(
+        x=df["Amino acid"],
+        y=df["Attention rate"],
+       #  range_y=[min(token_att), max(token_att)],
+        marker_color=colors  # marker color can be a single color value or an iterable
+    )])
+#     fig = px.histogram(df, x="Amino acid", y="Attention rate", range_y=[min(token_att), max(token_att)])
+    fig.update_layout(plot_bgcolor="white")
+    fig.update_xaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0)',mirror=False)
+    fig.update_yaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0.07)',mirror=False)
+    # Tick values are the position strings; tick text shows the token itself.
+    fig.update_layout(title={'text': "<b>Attention rate of amino acid sequence token</b>",
+                             'font':{'size':40},
+                             'y': 0.96,
+                             'x': 0.5,
+                             'xanchor': 'center',
+                             'yanchor': 'top'},
+                      xaxis=dict(tickmode='array',
+                                 tickvals=token_numb,
+                                 ticktext=token_label
+                                 ),
+                      xaxis_title={'text': "Amino acid sequence",
+                             'font':{'size':30}},
+                      yaxis_title={'text': "Attention rate",
+                             'font':{'size':30}},
+                      font=dict(family="Calibri, monospace",
+                                size=17
+                                ))
+    fig.write_image(f'figures/Amino_acid_seq_{numb}.png', width=1.5*1200, height=0.75*1200, scale=2)
+    fig.show()
+def read_attention():
+    """Plot the attention table stored in ``../amino_acid_seq_attention.csv``.
+    Reads the CSV (columns "Amino acid" and "Attention rate"), draws a bar chart
+    whose y range spans the minimum to the maximum attention value, saves it as
+    ``figures/Amino_acid_seq.png`` and opens the figure.
+    NOTE(review): make_attention_table in this module writes
+    ``amino_acid_seq_attention_{numb}.csv`` to the working directory, which is
+    not the path read here - confirm which file is intended.
+    """
+    df = pd.read_csv("../amino_acid_seq_attention.csv")
+        # d_flow_values = np.asarray(d_read_flow_values)
+    fig = px.bar(df, x="Amino acid", y="Attention rate", range_y=[min(df["Attention rate"]), max(df["Attention rate"])])
+    fig.update_layout(plot_bgcolor="white")
+    fig.update_xaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0)',mirror=False)
+    fig.update_yaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0.07)',mirror=False)
+    fig.update_layout(title={'text': "<b>Attention rate of amino acid sequence token</b>",
+                             'font':{'size':40},
+                             'y': 0.96,
+                             'x': 0.5,
+                             'xanchor': 'center',
+                             'yanchor': 'top'},
+                      xaxis_title={'text': "Amino acid sequence",
+                             'font':{'size':30}},
+                      yaxis_title={'text': "Attention rate",
+                             'font':{'size':30}},
+                      font=dict(family="Calibri, monospace",
+                                size=17
+                                ))
+    fig.write_image('figures/Amino_acid_seq.png', width=1.5*1200, height=0.75*1200, scale=2)
+    fig.show()
+if __name__ == '__main__':
+    read_attention()

util/boxplot.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import pandas as pd
+import numpy as np
+from scipy import stats
+import plotly.express as px
+from plotly.subplots import make_subplots
+import plotly.graph_objects as go
+ROC = 1
+PR = 2
+def add_p_value_annotation(fig, array_columns, subplot=None, _format=None):
+    ''' Adds notations giving the p-value between two box plot data (two-sided t-test
+    with unequal variances, i.e. scipy's ttest_ind with equal_var=False)
+    Parameters:
+    ----------
+    fig: figure
+        plotly boxplot figure
+    array_columns: np.array
+        array of which columns to compare
+        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
+    subplot: None or int
+        specifies if the figures has subplots and what subplot to add the notation to
+    _format: None or dict
+        format characteristics for the lines; any of the keys 'interline',
+        'text_height' and 'color' may be given, missing keys fall back to
+        interline=0.03, text_height=1.03, color='black'
+    Returns:
+    -------
+    fig: figure
+        figure with the added notation
+    '''
+    # Merge caller overrides into the defaults so a partial dict cannot raise
+    # KeyError and no mutable default argument is shared between calls.
+    line_format = dict(interline=0.03, text_height=1.03, color='black')
+    if _format:
+        line_format.update(_format)
+    # Specify in what y_range to plot for each pair of columns
+    y_range = np.zeros([len(array_columns), 2])
+    for i in range(len(array_columns)):
+        y_range[i] = [1.03+i*line_format['interline'], 1.04+i*line_format['interline']]
+    # Get values from figure
+    fig_dict = fig.to_dict()
+    # Get indices if working with subplots
+    if subplot:
+        subplot_str = '' if subplot == 1 else str(subplot)
+        # Change the box index to the indices of the data for that subplot
+        indices = [index for index, data in enumerate(fig_dict['data'])
+                   if data['xaxis'] == 'x' + subplot_str]
+    else:
+        subplot_str = ''
+    xref = "x" + subplot_str
+    yref = "y" + subplot_str + " domain"
+    line_style = dict(color=line_format['color'], width=1.5,)
+    for index, column_pair in enumerate(array_columns):
+        if subplot:
+            data_pair = [indices[column_pair[0]], indices[column_pair[1]]]
+        else:
+            data_pair = column_pair
+        # Get the p-value
+        pvalue = stats.ttest_ind(
+            fig_dict['data'][data_pair[0]]['y'],
+            fig_dict['data'][data_pair[1]]['y'],
+            equal_var=False,
+        )[1]
+        if pvalue >= 0.05:
+            symbol = 'ns'
+        elif pvalue >= 0.01:
+            symbol = '*'
+        elif pvalue >= 0.001:
+            symbol = '**'
+        else:
+            symbol = '***'
+        y_low, y_high = y_range[index]
+        # Bracket = left vertical tick, horizontal bar, right vertical tick.
+        bracket_segments = [
+            (column_pair[0], y_low, column_pair[0], y_high),
+            (column_pair[0], y_high, column_pair[1], y_high),
+            (column_pair[1], y_low, column_pair[1], y_high),
+        ]
+        for seg_x0, seg_y0, seg_x1, seg_y1 in bracket_segments:
+            fig.add_shape(type="line",
+                xref=xref, yref=yref,
+                x0=seg_x0, y0=seg_y0,
+                x1=seg_x1, y1=seg_y1,
+                line=line_style
+            )
+        ## add text at the correct x, y coordinates
+        ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
+        fig.add_annotation(dict(font=dict(color=line_format['color'],size=14),
+            x=(column_pair[0] + column_pair[1])/2,
+            y=y_high*line_format['text_height'],
+            showarrow=False,
+            text=symbol,
+            textangle=0,
+            xref=xref,
+            yref=yref
+        ))
+    return fig
+def box_plot(df):
+    """Draw one grouped box plot of ``test_auroc`` per ``Task_name``, coloured by ``Model``.
+    Adds p-value brackets for the column pairs (0,7), (3,7) and (6,7) of the
+    first subplot, saves the figure to ``../figures/box_plot_integration.png``
+    and opens it.
+    NOTE(review): the path is relative to the working directory and the
+    ``../figures`` directory is not created here - confirm it exists.
+    """
+    fig = px.box(df, x = 'Task_name', y='test_auroc', color="Model")
+    fig.update_layout(plot_bgcolor="white")
+    fig.update_xaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0)',mirror=False)
+    fig.update_yaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0.07)',mirror=False)
+    fig.update_layout(title={'text': "<b>ROC-AUC score distribution</b>",
+                             'font':{'size':40},
+                             'y': 0.96,
+                             'x': 0.5,
+                             'xanchor': 'center',
+                             'yanchor': 'top'},
+                      xaxis_title={'text': "Datasets",
+                             'font':{'size':30}},
+                      yaxis_title={'text': "ROC-AUC",
+                             'font':{'size':30}},
+                      font=dict(family="Calibri, monospace",
+                                size=17
+                                ))
+    # Indices refer to trace positions within subplot 1 (see add_p_value_annotation).
+    fig = add_p_value_annotation(fig, [[0,7], [3,7], [6,7]], subplot=1)
+    fig.write_image('../figures/box_plot_integration.png', width=1.5*1200, height=0.75*1200, scale=2)
+    fig.show()
+def go_box_plot(df, metric = ROC):
+    """Draw one subplot per dataset with a box per model and save it as a PNG.
+    ``metric == ROC`` plots column ``test_auroc`` into ``boxplot_auroc.png``;
+    any other value plots ``test_auprc`` into ``boxplot_auprc.png``. Files are
+    written under ``../figures/`` and the figure is opened afterwards. Only the
+    traces of the first subplot contribute legend entries.
+    """
+    dataset_list = ['BIOSNAP', 'DAVIS', 'BindingDB']
+    model_list = ['LR', 'DNN', 'GNN-CPI', 'DeepDTI', 'DeepDTA', 'DeepConv-DTI', 'Moltrans', 'ours']
+    clr_list = ['red', 'orange', 'green', 'indianred', 'lightseagreen', 'goldenrod', 'magenta', 'blue']
+    if metric == ROC:
+        file_title, select_metric = "boxplot_auroc.png", "test_auroc"
+    else:
+        file_title, select_metric = "boxplot_auprc.png", "test_auprc"
+    fig = make_subplots(rows=1, cols=3, subplot_titles=list(dataset_list))
+    by_dataset = df.groupby(df.Task_name)
+    for column_no, dataset in enumerate(dataset_list, start=1):
+        dataset_rows = by_dataset.get_group(dataset)
+        by_model = dataset_rows.groupby(dataset_rows.Model)
+        for model, colour in zip(model_list, clr_list):
+            model_rows = by_model.get_group(model)
+            trace = go.Box(y=model_rows[select_metric],
+                           name=model,
+                           marker_color=colour,
+                           showlegend=(column_no == 1))
+            fig.append_trace(trace, row=1, col=column_no)
+    fig.write_image(f'../figures/{file_title}', width=1.5*1200, height=0.75*1200, scale=2)
+    fig.show()
+if __name__ == '__main__':
+    df = pd.read_csv("../dataset/wandb_export_boxplotdata.csv")
+    box_plot(df)

util/data/bindingdb_kd.tab ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b72a38ae07a75d5d4c269d2776b6e62e0edde29ff7cf8a323158c08951f808d1
+size 54432102

util/data/davis.tab ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d4c6809dcb7c5da2b91a32d594d6935b75484940bde4d18055eb5e1059262f4
+size 21376712

util/emetric.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import numpy as np
+def get_cindex(Y, P):
+    """Return the concordance index of predictions ``P`` against labels ``Y``.
+    Every pair (i, j) with j < i and Y[i] > Y[j] is counted; it scores 1 when
+    P[i] > P[j], 0.5 when P[i] == P[j] and 0 otherwise. The result is the mean
+    score over the counted pairs, or 0 when no pair qualifies.
+    NOTE(review): pairs where the earlier element has the larger label are not
+    counted, so the value depends on input order - confirm this is intended.
+    """
+    summ = 0
+    pair = 0
+    for i in range(1, len(Y)):
+        for j in range(i):
+            if Y[i] > Y[j]:
+                pair += 1
+                summ += 1 * (P[i] > P[j]) + 0.5 * (P[i] == P[j])
+    # Compare by value: `is not` on integers tests object identity and only
+    # works by accident of CPython's small-int cache.
+    if pair != 0:
+        return summ / pair
+    return 0
+def r_squared_error(y_obs,y_pred):
+    """Return the squared Pearson correlation between ``y_obs`` and ``y_pred``.
+    Computed as (sum of co-deviations)^2 divided by the product of the two
+    sums of squared deviations from the respective means.
+    """
+    y_obs = np.array(y_obs)
+    y_pred = np.array(y_pred)
+    # Centre each series once; the means were previously recomputed for every
+    # element, which made the function quadratic in the input length.
+    obs_dev = y_obs - np.mean(y_obs)
+    pred_dev = y_pred - np.mean(y_pred)
+    mult = sum(pred_dev * obs_dev)
+    mult = mult * mult
+    y_obs_sq = sum(obs_dev * obs_dev)
+    y_pred_sq = sum(pred_dev * pred_dev)
+    return mult / float(y_obs_sq * y_pred_sq)
+def get_k(y_obs,y_pred):
+    """Return sum(y_obs * y_pred) / sum(y_pred * y_pred)."""
+    observed = np.array(y_obs)
+    predicted = np.array(y_pred)
+    numerator = sum(observed * predicted)
+    denominator = float(sum(predicted * predicted))
+    return numerator / denominator
+def squared_error_zero(y_obs,y_pred):
+    """Return 1 - SS_res / SS_tot for the fit ``y_obs ~ k * y_pred``.
+    ``k`` comes from get_k; SS_res is the sum of squared residuals of that fit
+    and SS_tot the sum of squared deviations of ``y_obs`` from its mean.
+    """
+    k = get_k(y_obs,y_pred)
+    y_obs = np.array(y_obs)
+    y_pred = np.array(y_pred)
+    # Compute the mean once instead of once per element (was quadratic).
+    obs_dev = y_obs - np.mean(y_obs)
+    residual = y_obs - (k * y_pred)
+    upp = sum(residual * residual)
+    down = sum(obs_dev * obs_dev)
+    return 1 - (upp / float(down))
+def get_rm2(ys_orig,ys_line):
+    """Return r2 * (1 - sqrt(|r2^2 - r02^2|)).
+    ``r2`` is r_squared_error(ys_orig, ys_line) and ``r02`` is
+    squared_error_zero(ys_orig, ys_line).
+    NOTE(review): both inputs are squared again inside the root - confirm this
+    matches the intended metric definition.
+    """
+    r2 = r_squared_error(ys_orig, ys_line)
+    r02 = squared_error_zero(ys_orig, ys_line)
+    gap = np.absolute((r2 * r2) - (r02 * r02))
+    penalty = np.sqrt(gap)
+    return r2 * (1 - penalty)

util/load_dataset.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from tdc.multi_pred import DTI
+import pandas as pd
+import numpy as np
+# Script entry point: download the BindingDB_Kd and DAVIS datasets through
+# TDC, convert their affinities to log scale and write the train/valid/test
+# splits as CSV files under ../dataset_kd/.
+if __name__ == '__main__':
+    bindingDB_data = DTI(name = 'BindingDB_Kd')
+    davis_data = DTI(name = 'DAVIS')
+    # Only BindingDB is harmonised (mode 'max_affinity') before the log conversion.
+    bindingDB_data.harmonize_affinities(mode = 'max_affinity')
+    bindingDB_data.convert_to_log(form = 'binding')
+    davis_data.convert_to_log(form = 'binding')
+    split_bindingDB = bindingDB_data.get_split()
+    split_davis = davis_data.get_split()
+    dataset_list = ["train", "valid", "test"]
+    for dataset_type in dataset_list:
+        df_bindingDB = pd.DataFrame(split_bindingDB[dataset_type])
+        df_davis = pd.DataFrame(split_davis[dataset_type])
+        # NOTE(review): ../dataset_kd is not created here - confirm it exists.
+        df_bindingDB.to_csv(f"../dataset_kd/bindingDB_{dataset_type}.csv", index=False)
+        df_davis.to_csv(f"../dataset_kd/davis_{dataset_type}.csv", index=False)
+    # NOTE(review): the three values below are computed from the last split
+    # only ("test") and are never used afterwards; Y_davis_log also applies
+    # log10 to data already passed through convert_to_log - confirm whether
+    # this is leftover exploration code.
+    Y_bindingDB = np.array(df_bindingDB.Y)
+    Y_davis = np.array(df_davis.Y)
+    Y_davis_log = [np.log10(Y_davis)]

util/make_external_validation.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import numpy as np
+import pandas as pd
+# Script entry point: pair every external SMILES string with every amino-acid
+# sequence (and its CYP type) and write the pairs to
+# ../dataset/external_dataset.csv.
+if __name__ == '__main__':
+    smiles = pd.read_csv("../dataset/external_smiles.csv")
+    ass = pd.read_csv("../dataset/external_aas.csv")
+    smiles_data = list(np.array(smiles['smiles']))
+    smiles_label = [x.split() for x in list(np.array(smiles['label'].tolist()))]
+    ass_data = list(np.array(ass['aas']))
+    cyp_type = list(np.array(ass['CYP_type']))
+    # SMILES vary slowest, sequences fastest - same row order as nested index loops.
+    external_dataset = [
+        [smiles_entry, aas_entry, cyp_entry]
+        for smiles_entry in smiles_data
+        for aas_entry, cyp_entry in zip(ass_data, cyp_type)
+    ]
+    df = pd.DataFrame(external_dataset, columns=['smiles', 'aas', 'CYP_type'])
+    df.to_csv('../dataset/external_dataset.csv', index=False)
+    print(smiles['smiles'][0])
+    print(ass['CYP_type'][0])

util/utils.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import json, copy
+from easydict import EasyDict
+import torch.nn as nn
+class DictX(dict):
+    """dict whose items can also be read, set and deleted as attributes.
+    Missing keys raise AttributeError for attribute access and deletion.
+    """
+    def __getattr__(self, key):
+        try:
+            value = self[key]
+        except KeyError as missing:
+            raise AttributeError(missing)
+        return value
+    def __setattr__(self, key, value):
+        self[key] = value
+    def __delattr__(self, key):
+        try:
+            del self[key]
+        except KeyError as missing:
+            raise AttributeError(missing)
+    def __repr__(self):
+        return f"<DictX {dict.__repr__(self)}>"
+def load_hparams(file_path):
+    """Load a JSON hyper-parameter file and return it as an EasyDict.
+    The result supports both ``hparams["lr"]`` and ``hparams.lr``. Previously
+    the EasyDict was created and then overwritten by the plain dict from
+    json.load, so attribute access on the result failed.
+    """
+    with open(file_path, 'r') as f:
+        hparams = EasyDict(json.load(f))
+    return hparams
+def deleteEncodingLayers(model, num_layers_to_keep):  # must pass in the full bert model
+    """Return a deep copy of ``model`` keeping only the first encoder layers.
+    Parameters
+    ----------
+    model : module exposing ``model.encoder.layer`` as an indexable module list
+    num_layers_to_keep : int
+        Number of leading layers to keep; indexing past the end raises IndexError.
+    Returns
+    -------
+    A new model whose ``encoder.layer`` holds copies of the first
+    ``num_layers_to_keep`` layers. ``model`` itself is left untouched.
+    """
+    # Copy first and truncate the copy, so the returned model does not share
+    # layer modules (and therefore parameters) with the original.
+    copyOfModel = copy.deepcopy(model)
+    copiedLayers = copyOfModel.encoder.layer
+    newModuleList = nn.ModuleList()
+    # Now iterate over all layers, keeping only the relevant layers.
+    for i in range(num_layers_to_keep):
+        newModuleList.append(copiedLayers[i])
+    copyOfModel.encoder.layer = newModuleList
+    return copyOfModel