# Web-page residue from the file viewer, kept as comments so the module parses:
# Tarive's picture | Update app.py | 2132cad verified | raw | history blame | 2.77 kB
# app.py (Updated Version)
import gradio as gr
from transformers import pipeline
import pickle
# =============================================================================
# 1. LOAD YOUR MODEL AND THE SAVED LABEL ENCODER
# =============================================================================
# Identifier of the model repository the demo serves.
# Make sure this is correct before deploying.
model_path = "Tarive/esm2_t12_35M_UR50D-finetuned-pfam-1k"

# Build the text-classification pipeline from that repository.
classifier = pipeline(task="text-classification", model=model_path)

# Restore the label encoder saved alongside the app; it is used to map the
# model's numeric class indices back to the original family names.
# NOTE(review): unpickling can execute arbitrary code, so "label_encoder.pkl"
# must only ever be a file you produced and uploaded yourself.
with open("label_encoder.pkl", "rb") as encoder_file:
    label_encoder = pickle.load(encoder_file)
# =============================================================================
# 2. DEFINE THE PREDICTION FUNCTION WITH LABEL DECODING
# =============================================================================
# This function now decodes the labels before displaying them.
def _family_index(raw_label):
    """Return the integer after the last underscore of *raw_label*, or None.

    The model is expected to emit generic labels such as ``"LABEL_455"``.
    Anything whose trailing part is not an integer yields ``None`` so the
    caller can fall back to the raw label instead of crashing.
    """
    _, _, suffix = str(raw_label).rpartition("_")
    try:
        return int(suffix)
    except ValueError:
        return None


def predict_family(sequence):
    """Predict the most likely protein families for an amino acid sequence.

    Args:
        sequence: The sequence text entered by the user. Whitespace
            (including line breaks from pasted multi-line text) is removed
            before classification. ``None`` is treated like an empty string.

    Returns:
        A dict mapping family name to the model's score for the top 5
        predictions, or an empty dict when there is nothing to classify.
    """
    # Collapse pasted multi-line input into one contiguous sequence.
    cleaned = "".join((sequence or "").split())
    if not cleaned:
        # Nothing to classify; skip the model call instead of scoring
        # an empty string.
        return {}

    # Get the top 5 predictions from the model.
    predictions = classifier(cleaned, top_k=5)

    results = {}
    for prediction in predictions:
        raw_label = prediction["label"]
        index = _family_index(raw_label)
        if index is None:
            # Label is not of the "LABEL_<n>" form; show it unchanged.
            family = raw_label
        else:
            # Use the label encoder to recover the original family name.
            family = label_encoder.inverse_transform([index])[0]
        # Plain str/float keep the result independent of the encoder's
        # and the pipeline's own scalar types.
        results[str(family)] = float(prediction["score"])
    return results
# =============================================================================
# 3. CREATE THE GRADIO INTERFACE (No changes here)
# =============================================================================
# Multi-line text area where the user pastes the sequence to classify.
_sequence_box = gr.Textbox(
    lines=10,
    label="Protein Amino Acid Sequence",
    placeholder="Paste your protein sequence here...",
)

# Label widget that displays up to five classes with their scores.
_family_label = gr.Label(num_top_classes=5, label="Predicted Families")

# Sample sequences offered as one-click examples (one input per row).
_example_sequences = [
    ["MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR"],
    ["MTEYKLVVVGAGDVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVEVDCQQCMILDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGGCMS"],
]

# Wire the prediction function to its input and output widgets.
iface = gr.Interface(
    fn=predict_family,
    inputs=_sequence_box,
    outputs=_family_label,
    title="Protein Family Classifier",
    description="This demo uses a fine-tuned ESM-2 model to predict the protein family from its amino acid sequence. Enter a sequence to see the top 5 predictions and their confidence scores.",
    examples=_example_sequences,
    allow_flagging="never",
)

# Start serving the demo.
iface.launch()