File size: 5,268 Bytes
85c36de
942bf87
51a3749
ea9a1bf
e199881
51a3749
f0f9b27
 
f776418
f0f9b27
51a3749
68ded6f
e199881
8bc43cc
942bf87
f776418
f0f9b27
 
 
68ded6f
f0f9b27
68ded6f
f776418
dc9275e
f776418
 
 
 
 
 
 
 
 
 
3b84715
aa6838a
f0f9b27
 
68ded6f
c63f76d
68ded6f
bee2eef
68ded6f
 
8319384
bee2eef
 
8319384
f3b700a
f0f9b27
68ded6f
 
f0f9b27
9748994
68ded6f
 
0c1f1e9
 
f776418
0c1f1e9
 
 
 
f0f9b27
 
0c1f1e9
48574c5
 
 
 
0c1f1e9
 
68ded6f
0c1f1e9
68ded6f
 
48574c5
68ded6f
 
 
 
0c1f1e9
68ded6f
0c1f1e9
 
f776418
68ded6f
357b75d
f776418
48574c5
f776418
357b75d
 
68ded6f
 
e9f8ebb
 
f776418
 
e9f8ebb
 
f776418
 
e9f8ebb
f776418
e9f8ebb
f776418
 
 
 
 
 
 
 
 
68ded6f
f776418
e9f8ebb
f776418
68ded6f
 
 
f776418
 
 
68ded6f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
from sklearn.preprocessing import MinMaxScaler
import torch
from transformers import BertTokenizer, BertModel
from lime.lime_tabular import LimeTabularExplainer
from math import expm1

# Load AMP Classifier
model = joblib.load("RF.joblib")
scaler = joblib.load("norm (4).joblib")

# Load ProtBert
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
protbert_model = protbert_model.to(device).eval()

# Selected Features
selected_features = [ ... ]  # keep your full selected_features list here

# LIME Explainer Setup
sample_data = np.random.rand(100, len(selected_features))
explainer = LimeTabularExplainer(
    training_data=sample_data,
    feature_names=selected_features,
    class_names=["AMP", "Non-AMP"],
    mode="classification"
)

# Feature Extractor
def extract_features(sequence):
    all_features_dict = {}
    sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
    if len(sequence) < 10:
        return "Error: Sequence too short."
    dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
    filtered_dipeptide_features = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
    ctd_features = CTD.CalculateCTD(sequence)
    auto_features = Autocorrelation.CalculateAutoTotal(sequence)
    pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
    all_features_dict.update(ctd_features)
    all_features_dict.update(filtered_dipeptide_features)
    all_features_dict.update(auto_features)
    all_features_dict.update(pseudo_features)
    feature_df_all = pd.DataFrame([all_features_dict])
    normalized_array = scaler.transform(feature_df_all.values)
    normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
    selected_df = normalized_df[selected_features].fillna(0)
    return selected_df.values

# MIC Predictor
def predictmic(sequence):
    sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
    if len(sequence) < 10:
        return {"Error": "Sequence too short or invalid."}
    seq_spaced = ' '.join(list(sequence))
    tokens = tokenizer(seq_spaced, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
    tokens = {k: v.to(device) for k, v in tokens.items()}
    with torch.no_grad():
        outputs = protbert_model(**tokens)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().reshape(1, -1)
    bacteria_config = {
        "E.coli": {"model": "coli_xgboost_model.pkl", "scaler": "coli_scaler.pkl", "pca": None},
        "S.aureus": {"model": "aur_xgboost_model.pkl", "scaler": "aur_scaler.pkl", "pca": None},
        "P.aeruginosa": {"model": "arg_xgboost_model.pkl", "scaler": "arg_scaler.pkl", "pca": None},
        "K.Pneumonia": {"model": "pne_mlp_model.pkl", "scaler": "pne_scaler.pkl", "pca": "pne_pca.pkl"}
    }
    mic_results = {}
    for bacterium, cfg in bacteria_config.items():
        try:
            scaler = joblib.load(cfg["scaler"])
            scaled = scaler.transform(embedding)
            transformed = joblib.load(cfg["pca"]).transform(scaled) if cfg["pca"] else scaled
            model = joblib.load(cfg["model"])
            mic_log = model.predict(transformed)[0]
            mic = round(expm1(mic_log), 3)
            mic_results[bacterium] = mic
        except Exception as e:
            mic_results[bacterium] = f"Error: {str(e)}"
    return mic_results

# Full Prediction with LIME Explanation
def full_prediction(sequence):
    features = extract_features(sequence)
    if isinstance(features, str):  # error
        return features

    prediction = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]
    amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
    confidence = round(probabilities[0 if prediction == 0 else 1] * 100, 2)
    
    result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"

    if prediction == 0:
        mic_values = predictmic(sequence)
        result += "\nPredicted MIC Values (µM):\n"
        for org, mic in mic_values.items():
            result += f"- {org}: {mic}\n"
    else:
        result += "\nMIC prediction skipped for Non-AMP sequences.\n"
    
    # LIME explanation
    explanation = explainer.explain_instance(
        data_row=features[0],
        predict_fn=model.predict_proba,
        num_features=10
    )
    result += "\nTop Features Influencing AMP Prediction:\n"
    for feat, weight in explanation.as_list():
        result += f"- {feat}: {round(weight, 4)}\n"

    return result

# Gradio UI
iface = gr.Interface(
    fn=full_prediction,
    inputs=gr.Textbox(label="Enter Protein Sequence"),
    outputs=gr.Textbox(label="Prediction + MIC + LIME"),
    title="AMP & MIC Predictor + LIME Explanation",
    description="Paste an amino acid sequence (≥10 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
)

iface.launch(share=True)