nonzeroexit committed on
Commit
7d97f16
·
verified ·
1 Parent(s): 8319384

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -16
app.py CHANGED
@@ -46,8 +46,8 @@ selected_features = [
46
  ]
47
 
48
  def extract_features(sequence):
49
- """Extract selected features and normalize them."""
50
- if len(sequence) <= 9: # Ensure sequence is long enough for PseudoAAC with lamda=9
51
  return "Error: Protein sequence must be longer than 9 amino acids to extract features (for lamda=9)."
52
 
53
  all_features_dict = {}
@@ -61,31 +61,36 @@ def extract_features(sequence):
61
  ctd_features = CTD.CalculateCTD(sequence)
62
  all_features_dict.update(ctd_features)
63
 
64
- pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9) # Set lamda=9
65
  all_features_dict.update(pseudo_features)
66
 
 
 
 
 
 
 
 
 
 
67
 
68
- feature_values = list(all_features_dict.values())
69
- feature_array = np.array(feature_values).reshape(-1, 1)
70
- normalized_features = scaler.transform(feature_array.T)
71
- normalized_features = normalized_features.flatten()
72
 
73
- selected_feature_dict = {}
74
- for i, feature in enumerate(selected_features):
75
- if feature in all_features_dict:
76
- selected_feature_dict[feature] = normalized_features[i]
77
 
78
- selected_feature_df = pd.DataFrame([selected_feature_dict])
79
- selected_feature_array = selected_feature_df.T.to_numpy()
80
 
81
- return selected_feature_array
 
 
 
82
 
83
 
84
  def predict(sequence):
85
  """Predicts whether the input sequence is an AMP."""
86
  features = extract_features(sequence)
87
- if isinstance(features, str) and features.startswith("Error:"): # Check if extract_features returned an error message
88
- return features # Return the error message directly
89
 
90
  prediction = model.predict(features)[0]
91
  probabilities = model.predict_proba(features)[0]
 
46
  ]
47
 
48
  def extract_features(sequence):
49
+ """Extract selected features, ensure order matches trained features, and normalize them."""
50
+ if len(sequence) <= 9:
51
  return "Error: Protein sequence must be longer than 9 amino acids to extract features (for lamda=9)."
52
 
53
  all_features_dict = {}
 
61
  ctd_features = CTD.CalculateCTD(sequence)
62
  all_features_dict.update(ctd_features)
63
 
64
+ pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
65
  all_features_dict.update(pseudo_features)
66
 
67
+ # Create an ordered list of feature values based on selected_features
68
+ ordered_feature_values = []
69
+ missing_features = []
70
+ for feature_name in selected_features:
71
+ if feature_name in all_features_dict:
72
+ ordered_feature_values.append(all_features_dict[feature_name])
73
+ else:
74
+ missing_features.append(feature_name)
75
+ ordered_feature_values.append(0) # Pad with 0 for missing features - important for consistent input size
76
 
77
+ if missing_features:
78
+ print(f"Warning: The following features were missing from extraction and padded with 0: {missing_features}")
 
 
79
 
 
 
 
 
80
 
81
+ feature_array = np.array(ordered_feature_values).reshape(1, -1) # Reshape to (1, n_features) for single sample
 
82
 
83
+ normalized_features = scaler.transform(feature_array) # Normalize the ordered feature array
84
+
85
+
86
+ return normalized_features # Return the normalized features as a 2D numpy array
87
 
88
 
89
  def predict(sequence):
90
  """Predicts whether the input sequence is an AMP."""
91
  features = extract_features(sequence)
92
+ if isinstance(features, str) and features.startswith("Error:"):
93
+ return features
94
 
95
  prediction = model.predict(features)[0]
96
  probabilities = model.predict_proba(features)[0]