Spaces:

Tarive
/

protein_fold_hack_nation_ai

Sleeping

App Files Files Community

Tarive committed 22 days ago

Commit

0dee19d

verified ·

1 Parent(s): 254a962

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -40

app.py CHANGED Viewed

@@ -1,64 +1,70 @@
-# app.py (Final Corrected Version)
 import gradio as gr
-from transformers import pipeline
 import pickle
-from huggingface_hub import hf_hub_download # Import the download function
 # =============================================================================
-# 1. LOAD YOUR MODEL AND THE SAVED LABEL ENCODER
 # =============================================================================
 # Define the path to your model repository
 model_path = "Tarive/esm2_t12_35M_UR50D-5k-families-balanced-augmented-weighted_optimized"
-# --- FIX FOR LFS ---
-# Explicitly download the label_encoder.pkl file from the repo.
-# This ensures the app can find the file even if it's stored with Git LFS.
-print("Downloading label encoder...")
-encoder_path = hf_hub_download(repo_id=model_path, filename="label_encoder_5k-2.pkl")
-print("Download complete.")
-# --- END FIX ---
-# Load the classification pipeline
-print("Loading classification pipeline...")
-classifier = pipeline("text-classification", model=model_path)
-print("Pipeline loaded.")
-# Load the label encoder from the path where it was downloaded
-print("Loading label encoder...")
 with open(encoder_path, "rb") as f:
     label_encoder = pickle.load(f)
 print("Label encoder loaded.")
 # =============================================================================
-# 2. DEFINE THE PREDICTION FUNCTION WITH LABEL DECODING
 # =============================================================================
-# This function now decodes the labels before displaying them.
 def predict_family(sequence):
-    # Get the top 5 predictions from the model
-    predictions = classifier(sequence, top_k=5)
-    # The model outputs labels like "LABEL_455". We need to extract the number.
-    results = {}
-    for p in predictions:
-        try:
-            # Extract the number from the label string (e.g., "LABEL_455" -> 455)
-            label_index = int(p['label'].split('_')[1])
-            # Use the label_encoder to find the original family name
-            original_label = label_encoder.inverse_transform([label_index])[0]
-            # Store the real name and score
-            results[original_label] = p['score']
-        except (ValueError, IndexError):
-            # Handle cases where the label format is unexpected
-            results[p['label']] = p['score']
     return results
 # =============================================================================
-# 3. CREATE THE GRADIO INTERFACE
 # =============================================================================
 print("Creating Gradio interface...")
 iface = gr.Interface(
@@ -77,11 +83,10 @@ iface = gr.Interface(
     examples=[
         ["MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR"],
         ["MTEYKLVVVGAGDVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVEVDCQQCMILDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGGCMS"],
-        ["MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA"]
     ],
-    allow_flagging="never" # Disables the "Flag" button for a cleaner interface
 )
-print("Interface created.")
 # Launch the interface!
 print("Launching app...")

+# app.py (Final, Robust Version)
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
 import pickle
+from huggingface_hub import hf_hub_download
 # =============================================================================
+# 1. LOAD MODEL, TOKENIZER, AND LABEL ENCODER
 # =============================================================================
 # Define the path to your model repository
 # Module-level setup: everything below runs once when the script is loaded.
 # The names it binds (tokenizer, model, device, label_encoder) are read by
 # predict_family on every request.
 model_path = "Tarive/esm2_t12_35M_UR50D-5k-families-balanced-augmented-weighted_optimized"
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+print("Loading model...")
+model = AutoModelForSequenceClassification.from_pretrained(model_path)
+# Move model to GPU if available for faster inference
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+print(f"Model loaded on device: {device}")
+# Download and load the label encoder
+print("Downloading and loading label encoder...")
+encoder_path = hf_hub_download(repo_id=model_path, filename="label_encoder_5k-2.pkl")
 # The encoder file is fetched from the same Hub repository as the model
 # weights (repo_id=model_path) and opened from the local path that
 # hf_hub_download returns.
 # SECURITY NOTE(review): pickle.load can execute arbitrary code while
 # unpickling. This is only safe as long as the repository above is trusted;
 # never point model_path at a repo you do not control.
 with open(encoder_path, "rb") as f:
     label_encoder = pickle.load(f)
 print("Label encoder loaded.")
 # =============================================================================
+# 2. DEFINE THE LOW-LEVEL PREDICTION FUNCTION
 # =============================================================================
+# This function manually replicates the training data processing steps.
 def predict_family(sequence, top_k=5):
     """Return the ``top_k`` most probable families for ``sequence``.

     The sequence is tokenized (truncated to 256 tokens), run through the
     module-level ``model`` without gradient tracking, and the softmax
     probabilities of the best-scoring classes are mapped back to family
     names with the module-level ``label_encoder``.

     Args:
         sequence: Input sequence text, passed straight to ``tokenizer``.
         top_k: Maximum number of predictions to return. Defaults to 5,
             the value that used to be hard-coded. It is clamped to the
             number of classes the model outputs, so a model with fewer
             classes no longer makes ``torch.topk`` raise.

     Returns:
         dict mapping decoded family name -> probability (Python float),
         inserted in descending order of model score.

     Raises:
         ValueError: If ``top_k`` is smaller than 1.
     """
     if top_k < 1:
         raise ValueError("top_k must be at least 1")
     # 1. Tokenize the input sequence with the exact same settings as training
     inputs = tokenizer(
         sequence,
         return_tensors="pt",  # Return PyTorch tensors
         truncation=True,
         padding=True,
         max_length=256,  # Ensure this matches your training max_length
     ).to(device)  # Move tokenized inputs to the same device as the model
     # 2. Get model predictions (logits); no gradients are needed for inference
     with torch.no_grad():
         logits = model(**inputs).logits
     # 3. Take the row for the single input explicitly instead of squeeze(),
     #    which would also collapse a size-1 class dimension.
     #    Assumes logits is (1, num_classes) for one input string - TODO confirm.
     row_logits = logits[0]
     # 4. Rank on the logits (same ranking as before), never asking for more
     #    entries than there are classes
     k = min(top_k, row_logits.numel())
     top_indices = torch.topk(row_logits, k).indices
     # 5. Convert logits to probabilities and keep only the selected scores,
     #    so the full probability vector is never copied into a Python list
     probabilities = torch.nn.functional.softmax(row_logits, dim=-1)
     scores = probabilities[top_indices].tolist()
     # 6. Decode all numerical labels back to family names in one call
     family_names = label_encoder.inverse_transform(top_indices.tolist())
     return dict(zip(family_names, scores))
 # =============================================================================
+# 3. CREATE THE GRADIO INTERFACE (No changes here)
 # =============================================================================
 print("Creating Gradio interface...")
 iface = gr.Interface(
     examples=[
         ["MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR"],
         ["MTEYKLVVVGAGDVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVEVDCQQCMILDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGGCMS"],
+        ["MSIKKILVSDKITTLEKFPASVTLDGADFTVHSSWYDTEKVREDIKEKYSHLISESENGFLFKEKDSKRFWRYFNEKDGVSYATGYQINPYFPANKKYEFGYTGAEWYYSYEPKNVARYGNFDETDAAHPCTYTVANYYLRDKSYFDDKYFNVPLYNMFFNDYNYYDFEYQTKNKFYFTNYKENPKYPFETNFENVPSKDTDDYIIKPYPGVKKFGEFDWDEFEGNTFDPGYYKDSYMYYQKKYDDSYKYKEYGVDPDDFSYKDKYDNNPKFNLYYKYVPDKKNN"]
     ],
+    allow_flagging="never"
 )
 # Launch the interface!
 print("Launching app...")