GiusMagi committed
Commit ff7bcc1 · verified · 1 Parent(s): 5d0c39f

Upload 5 files

Files changed (5):
  1. Dockerfile +14 -0
  2. app.py +85 -0
  3. final_report.csv +0 -0
  4. model_pipeline.py +311 -0
  5. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.10-slim
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . /app
+
+ ENV PORT=7860
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "info"]
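
To smoke-test the image locally one can build and run it, e.g. docker build -t predizione-shap . followed by docker run -p 7860:7860 predizione-shap (the tag name is illustrative), and then poll the /health endpoint that app.py exposes. A minimal readiness check, assuming the container is reachable on localhost:7860:

import json, time, urllib.request

# Poll /health until the startup hook has finished loading the model (sketch, stdlib only).
for _ in range(30):
    try:
        with urllib.request.urlopen("http://localhost:7860/health") as resp:
            status = json.load(resp)
        if status.get("ok"):  # app.py returns {"ok": ..., "uptime_s": ...}
            print(f"ready after {status['uptime_s']:.1f}s of uptime")
            break
    except OSError:
        pass  # server not accepting connections yet
    time.sleep(1)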
app.py ADDED
@@ -0,0 +1,85 @@
+ import time, logging, json, traceback
+ from typing import Optional, Dict, Any
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, Field
+ from model_pipeline import Predictor, FEATURE_MAP
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+ )
+ log = logging.getLogger("api")
+
+ # ----------- input model -----------
+ class PredictIn(BaseModel):
+     include_neg: bool = False
+     Debitore_cluster: Optional[str] = None
+     Stato_Giudizio: Optional[str] = None
+     Cedente: Optional[str] = None
+
+     # aliases for the original feature names containing spaces/dots
+     Importo_iniziale_outstanding: Optional[float] = Field(None, alias="Importo iniziale outstanding")
+     Decreto_sospeso: Optional[str] = Field(None, alias="Decreto sospeso")
+     Notifica_Decreto: Optional[str] = Field(None, alias="Notifica Decreto")
+     Opposizione_al_decreto_ingiuntivo: Optional[str] = Field(None, alias="Opposizione al decreto ingiuntivo")
+     Ricorso_al_TAR: Optional[str] = Field(None, alias="Ricorso al TAR")
+     Sentenza_TAR: Optional[str] = Field(None, alias="Sentenza TAR")
+     Atto_di_Precetto: Optional[str] = Field(None, alias="Atto di Precetto")
+     Decreto_Ingiuntivo: Optional[str] = Field(None, alias="Decreto Ingiuntivo")
+     Sentenza_giudizio_opposizione: Optional[str] = Field(None, alias="Sentenza giudizio opposizione")
+     giorni_da_iscrizione: Optional[int] = None
+     giorni_da_cessione: Optional[int] = None
+     Zona: Optional[str] = None
+
+     model_config = {"populate_by_name": True, "extra": "allow"}
+
+ # ----------- app -----------
+ app = FastAPI(title="Predizione+SHAP API", version="1.0.0")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
+ )
+
+ t0 = time.time()
+ predictor: Predictor | None = None
+
+ @app.on_event("startup")
+ def _load_model():
+     global predictor
+     predictor = Predictor()
+     log.info(f"Model loaded in {predictor.load_seconds:.2f}s")
+
+ @app.get("/health")
+ def health():
+     return {"ok": predictor is not None, "uptime_s": time.time() - t0}
+
+ @app.post("/predict")
+ def predict(inp: PredictIn):
+     if predictor is None:
+         raise HTTPException(503, "Model not ready")
+
+     # rebuild the payload under the original feature names
+     payload: Dict[str, Any] = {}
+     for k in FEATURE_MAP.values():
+         ak = k.replace(" ", "_").replace(".", "_")
+         payload[k] = getattr(inp, ak, None)
+     payload["include_neg"] = inp.include_neg
+
+     try:
+         out = predictor.predict_dict(payload, include_neg=inp.include_neg)
+
+         # make sure the response key is 'class' (no alias confusion)
+         if "class_" in out and "class" not in out:
+             out["class"] = out.pop("class_")
+
+         log.info(json.dumps({
+             "event": "predict_ok",
+             "class": out.get("class"),
+             "stage": out.get("stage_used"),
+             "p100": round(out.get("p100", 0.0), 4)
+         }))
+         return out
+     except Exception as e:
+         log.exception("predict_error")
+         raise HTTPException(500, f"Prediction error: {e}") from e
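
Because PredictIn sets populate_by_name together with the aliases above, a client can send either the underscored field names or the original labels with spaces and dots. A minimal client sketch using only the standard library; the host/port match the Dockerfile's CMD, while every field value below is illustrative (features omitted from the payload are imputed server-side):

import json, urllib.request

payload = {
    "include_neg": True,
    "Debitore_cluster": "cluster_A",          # hypothetical category level
    "Stato_Giudizio": "in corso",             # hypothetical category level
    "Importo iniziale outstanding": 25000.0,  # aliased name, spaces allowed
    "giorni_da_iscrizione": 120,
    "giorni_da_cessione": 90,
    "Zona": "Nord",                           # hypothetical category level
}

req = urllib.request.Request(
    "http://localhost:7860/predict",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    out = json.load(resp)
print(out["class"], out["p100"], out["one_liner"])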
final_report.csv ADDED
The diff for this file is too large to render. See raw diff
 
model_pipeline.py ADDED
@@ -0,0 +1,311 @@
+ import numpy as np, pandas as pd, warnings, time, uuid
+ warnings.filterwarnings("ignore")
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.base import BaseEstimator, ClassifierMixin
+ import xgboost as xgb
+ import shap
+
+ # -------------------- CONFIG --------------------
+ DATA_PATH = "/app/final_report.csv"  # the Dockerfile copies the repo root (and this CSV) into /app
+
+ FEATURE_MAP = {
+     "Debitore_cluster": "Debitore_cluster",
+     "Stato_Giudizio": "Stato_Giudizio",
+     "Cedente": "Cedente",
+     "Importo.iniziale.outstanding": "Importo iniziale outstanding",
+     "Decreto.sospeso": "Decreto sospeso",
+     "Notifica.Decreto": "Notifica Decreto",
+     "Opposizione.al.decreto.ingiuntivo": "Opposizione al decreto ingiuntivo",
+     "Ricorso.al.TAR": "Ricorso al TAR",
+     "Sentenza.TAR": "Sentenza TAR",
+     "Atto.di.Precetto": "Atto di Precetto",
+     "Decreto.Ingiuntivo": "Decreto Ingiuntivo",
+     "Sentenza.giudizio.opposizione": "Sentenza giudizio opposizione",
+     "giorni_da_iscrizione": "giorni_da_iscrizione",
+     "giorni_da_cessione": "giorni_da_cessione",
+     "Zona": "Zona"
+ }
+
+ LABELS = ["quasi_nulla", "bassa", "media", "alta"]
+ BINS = [0, 11, 30, 70, 100]
+ MIDPOINTS = np.array([5.5, 20.5, 50.0, 85.0])
+
+ MONTH_BINS_DAYS = np.array([0, 30, 60, 90, 180, 360, 720, 1e9], dtype=float)
+ MONTH_LABELS = ["<1m", "1–2m", "2–3m", "3–6m", "6–12m", "12–24m", ">=24m"]
+
+ IMPORTO_BINS = [0.0, 1_000.0, 10_000.0, 50_000.0, 100_000.0, 500_000.0, 1_000_000.0, 2_000_000.0]
+ IMPORTO_LABELS = ["<1k", "1–10k", "10–50k", "50–100k", "100–500k", "500k–1M", ">=1M"]
+
+ RANDOM_STATE = 42
+ P100_THR_AUTO = 0.71
+
+ STAGE1_LOGIT_PARAMS = dict(max_iter=500, solver='liblinear')
+ STAGE2_ORD_XGB_PARAMS = dict(
+     objective="binary:logistic", n_estimators=700, learning_rate=0.05,
+     max_depth=4, subsample=0.9, colsample_bytree=0.85,
+     min_child_weight=2.0, gamma=0.1, reg_lambda=5.0, reg_alpha=0.5,
+     n_jobs=-1, random_state=RANDOM_STATE, verbosity=0
+ )
+
+ TOP_K_TEXT = 3
+ MIN_ABS_SHOW = 0.01
+ TOP_K_ONELINER = 2
+
+ # -------------------- ORDINAL MODEL --------------------
+ class OrdinalXGB(BaseEstimator, ClassifierMixin):
+     """Ordinal classifier built from one binary XGBoost per cumulative threshold P(y >= k)."""
+     def __init__(self, n_classes=4, **xgb_params):
+         self.n_classes = n_classes
+         self.xgb_params = xgb_params
+         self.models = []
+
+     def fit(self, X, y, sample_weight=None):
+         self.models = []
+         for k in range(1, self.n_classes):
+             y_bin = (y >= k).astype(int)
+             clf = xgb.XGBClassifier(**self.xgb_params)
+             clf.fit(X, y_bin, sample_weight=sample_weight)
+             self.models.append(clf)
+         return self
+
+     def _cum_probs(self, X):
+         cps = np.vstack([clf.predict_proba(X)[:, 1] for clf in self.models]).T
+         cps = np.clip(cps, 1e-6, 1 - 1e-6)
+         # enforce monotonically non-increasing cumulative probabilities
+         for k in range(1, cps.shape[1]):
+             cps[:, k] = np.minimum(cps[:, k-1], cps[:, k])
+         return cps
+
+     def predict_proba(self, X):
+         # class probabilities as differences of consecutive cumulative probabilities
+         cps = self._cum_probs(X)
+         n = X.shape[0]
+         proba = np.zeros((n, self.n_classes))
+         proba[:, 0] = 1 - cps[:, 0]
+         for k in range(1, self.n_classes - 1):
+             proba[:, k] = cps[:, k-1] - cps[:, k]
+         proba[:, -1] = cps[:, -1]
+         s = proba.sum(axis=1, keepdims=True)
+         s[s == 0] = 1.0
+         return np.clip(proba / s, 0, 1)
+
+ def mode_(s: pd.Series):
+     s = s.dropna()
+     return s.mode().iloc[0] if len(s) else np.nan
+
+ class Predictor:
+     def __init__(self, data_path=DATA_PATH):
+         t0 = time.time()
+         self.data_path = data_path
+
+         df = pd.read_csv(self.data_path)
+         inc = df['incassi_perc'].replace([np.inf, -np.inf], np.nan).fillna(100.0).clip(0, 100)
+         df_model = df[[v for v in FEATURE_MAP.values() if v in df.columns]].copy()
+         df_model['incassi_perc_capped'] = inc
+         df_model['y100'] = (inc >= 100.0 - 1e-9).astype(int)
+         df_model['livello'] = pd.cut(np.minimum(inc, 99.999), bins=BINS, labels=LABELS, right=False, include_lowest=True)
+
+         self.num_cols, self.cat_cols = [], []
+         for c in FEATURE_MAP.values():
+             if c in df_model.columns:
+                 (self.num_cols if pd.api.types.is_numeric_dtype(df_model[c]) else self.cat_cols).append(c)
+
+         self.params, full_oh = self.preprocess_fit(df_model)
+         self.feat_cols_full = [c for c in full_oh.columns if c not in ['incassi_perc_capped', 'y100', 'livello']]
+
+         # stage 1: probability of full (100%) recovery; stage 2: ordinal level among partial recoveries
+         self.stage1_final = LogisticRegression(**STAGE1_LOGIT_PARAMS).fit(full_oh[self.feat_cols_full], full_oh['y100'])
+         full_lt = full_oh[full_oh['y100'] == 0].copy()
+         y_ord_full = pd.Categorical(full_lt['livello'], categories=LABELS, ordered=True).codes
+         self.stage2_final = OrdinalXGB(n_classes=4, **STAGE2_ORD_XGB_PARAMS).fit(full_lt[self.feat_cols_full].values, y_ord_full)
+
+         shap.initjs()
+         rng = np.random.RandomState(0)
+         bg_idx = rng.choice(len(full_oh), size=min(200, len(full_oh)), replace=False)
+         bg_matrix = full_oh.iloc[bg_idx][self.feat_cols_full].values
+         self.explainer_st1 = shap.LinearExplainer(self.stage1_final, bg_matrix, link=shap.links.identity)
+         self.explainers_st2 = [shap.TreeExplainer(clf, bg_matrix, model_output="probability",
+                                                   feature_perturbation="interventional")
+                                for clf in self.stage2_final.models]
+
+         self.ORIGINAL_CAT_COLS = [c for c in self.cat_cols]
+         self.load_seconds = time.time() - t0
+
+     def preprocess_fit(self, train_df: pd.DataFrame):
+         params = {}
+         means = {c: train_df[c].mean(skipna=True) for c in self.num_cols}
+         modes = {c: mode_(train_df[c]) for c in self.cat_cols}
+         tr = train_df.copy()
+         for c in self.num_cols:
+             tr[c] = tr[c].fillna(means[c])
+         for c in self.cat_cols:
+             tr[c] = tr[c].fillna(modes[c]).astype(str)
+
+         # drop categoricals with fewer than two observed levels
+         one_level = [c for c in self.cat_cols if tr[c].nunique(dropna=True) < 2]
+         keep_cats = [c for c in self.cat_cols if c not in one_level]
+         params['removed_cats'] = one_level
+
+         params['month_bins_days'] = MONTH_BINS_DAYS.tolist()
+         params['month_labels'] = MONTH_LABELS
+         tr['iscr_month_bin'] = pd.cut(tr['giorni_da_iscrizione'], MONTH_BINS_DAYS, labels=MONTH_LABELS, right=False, include_lowest=True)
+         tr['cess_month_bin'] = pd.cut(tr['giorni_da_cessione'], MONTH_BINS_DAYS, labels=MONTH_LABELS, right=False, include_lowest=True)
+         for c in ['iscr_month_bin', 'cess_month_bin']:
+             if tr[c].nunique(dropna=True) >= 2 and c not in keep_cats:
+                 keep_cats.append(c)
+
+         params['importo_bins'] = IMPORTO_BINS
+         params['importo_labels'] = IMPORTO_LABELS
+         tr['imp_bucket'] = pd.cut(tr['Importo iniziale outstanding'], IMPORTO_BINS, labels=IMPORTO_LABELS, right=False, include_lowest=True)
+         if tr['imp_bucket'].nunique(dropna=True) >= 2 and 'imp_bucket' not in keep_cats:
+             keep_cats.append('imp_bucket')
+
+         params['keep_cats'] = keep_cats
+         params['levels_map'] = {c: sorted(tr[c].astype(str).dropna().unique().tolist()) for c in keep_cats}
+
+         # standardize log-amount and log-days; fall back to unit scale when the std is zero
+         x_imp_log = np.log1p(tr['Importo iniziale outstanding'].clip(lower=0))
+         params['scale_imp'] = (x_imp_log.mean(), x_imp_log.std(ddof=0) or 1.0)
+         tr['x_imp_log'] = (x_imp_log - params['scale_imp'][0]) / params['scale_imp'][1]
+
+         g_iscr_log = np.log(tr['giorni_da_iscrizione'].clip(lower=1))
+         params['scale_iscr'] = (g_iscr_log.mean(), g_iscr_log.std(ddof=0) or 1.0)
+         tr['giorni_log'] = (g_iscr_log - params['scale_iscr'][0]) / params['scale_iscr'][1]
+
+         g_cess = tr['giorni_da_cessione']
+         params['scale_cess'] = (g_cess.mean(), g_cess.std(ddof=0) or 1.0)
+         tr['giorni_cessione_z'] = (g_cess - params['scale_cess'][0]) / params['scale_cess'][1]
+
+         tr = tr.drop(columns=['Importo iniziale outstanding', 'giorni_da_iscrizione', 'giorni_da_cessione'])
+         tr_oh = pd.get_dummies(tr, columns=keep_cats, drop_first=True, dtype=float)
+         params['oh_columns'] = [c for c in tr_oh.columns if c not in ['incassi_perc_capped', 'y100', 'livello']]
+         params['means'] = means
+         params['modes'] = modes
+         return params, tr_oh
+
+     def preprocess_apply(self, test_df: pd.DataFrame):
+         te = test_df.copy()
+         for c in self.num_cols:
+             te[c] = te[c].fillna(self.params['means'][c])
+         for c in self.cat_cols:
+             te[c] = te[c].fillna(self.params['modes'][c]).astype(str)
+         te = te.drop(columns=self.params['removed_cats'], errors='ignore')
+
+         te['iscr_month_bin'] = pd.cut(te['giorni_da_iscrizione'], np.array(self.params['month_bins_days'], float),
+                                       labels=self.params['month_labels'], right=False, include_lowest=True)
+         te['cess_month_bin'] = pd.cut(te['giorni_da_cessione'], np.array(self.params['month_bins_days'], float),
+                                       labels=self.params['month_labels'], right=False, include_lowest=True)
+         te['imp_bucket'] = pd.cut(te['Importo iniziale outstanding'], np.array(self.params['importo_bins'], float),
+                                   labels=self.params['importo_labels'], right=False, include_lowest=True)
+
+         x_imp_log = np.log1p(te['Importo iniziale outstanding'].clip(lower=0))
+         te['x_imp_log'] = (x_imp_log - self.params['scale_imp'][0]) / self.params['scale_imp'][1]
+         g_iscr_log = np.log(te['giorni_da_iscrizione'].clip(lower=1))
+         te['giorni_log'] = (g_iscr_log - self.params['scale_iscr'][0]) / self.params['scale_iscr'][1]
+         te['giorni_cessione_z'] = (te['giorni_da_cessione'] - self.params['scale_cess'][0]) / self.params['scale_cess'][1]
+
+         keep_cats = [c for c in self.cat_cols if c not in self.params['removed_cats']]
+         for c in ['iscr_month_bin', 'cess_month_bin', 'imp_bucket']:
+             if c not in keep_cats:
+                 keep_cats.append(c)
+
+         te = te.drop(columns=['Importo iniziale outstanding', 'giorni_da_iscrizione', 'giorni_da_cessione'])
+         te_oh = pd.get_dummies(te, columns=keep_cats, drop_first=True, dtype=float)
+
+         # align one-hot columns with the training layout
+         for col in self.params['oh_columns']:
+             if col not in te_oh.columns:
+                 te_oh[col] = 0.0
+         extra = [c for c in te_oh.columns if c not in self.params['oh_columns'] + ['incassi_perc_capped', 'y100', 'livello']]
+         if extra:
+             te_oh = te_oh.drop(columns=extra)
+
+         target_cols_all = ['incassi_perc_capped', 'y100', 'livello']
+         target_cols_present = [c for c in target_cols_all if c in te_oh.columns]
+         te_oh = te_oh[self.params['oh_columns'] + target_cols_present]
+         return te_oh
+
+     def active_levels_from_raw(self, raw_row: pd.DataFrame):
+         out = {}
+         s = raw_row.iloc[0]
+         for c in self.ORIGINAL_CAT_COLS:
+             v = s.get(c, np.nan)
+             out[c] = self.params['levels_map'].get(c, ["(baseline)"])[0] if (pd.isna(v) or str(v).strip() == "") else str(v)
+         return out
+
+     def collapse_shap(self, vals_row: np.ndarray, feature_names, active_levels):
+         # fold one-hot SHAP contributions back into a single value per original categorical
+         vals_s = pd.Series(vals_row, index=feature_names)
+         used = set(); out_vals = []; out_names = []
+         for cat, levels in self.params['levels_map'].items():
+             prefix = f"{cat}_"
+             cols = [c for c in feature_names if c.startswith(prefix)]
+             if not cols:
+                 continue
+             used.update(cols)
+             total = float(vals_s[cols].sum())
+             out_vals.append(total)
+             out_names.append(f"{cat} = {active_levels.get(cat, levels[0] if levels else '(baseline)')}")
+         for c in feature_names:
+             if c in used or c in ["incassi_perc_capped", "y100", "livello"]:
+                 continue
+             out_vals.append(float(vals_s[c]))
+             out_names.append(c)
+         out_vals = np.array(out_vals)
+         out_names = np.array(out_names)
+         idx = np.argsort(-np.abs(out_vals))
+         return out_names[idx], out_vals[idx]
+
+     def explain_text_for_stage1(self, X_row, raw_row):
+         vals = self.explainer_st1.shap_values(X_row.reshape(1, -1))
+         vals_row = vals[0] if hasattr(vals, "__len__") else vals
+         return self.collapse_shap(vals_row, self.feat_cols_full, self.active_levels_from_raw(raw_row))
+
+     def explain_text_for_stage2(self, X_row, raw_row, k_thr: int):
+         vals = self.explainers_st2[k_thr - 1].shap_values(X_row.reshape(1, -1))
+         vals_row = vals[0] if hasattr(vals, "__len__") else vals
+         return self.collapse_shap(vals_row, self.feat_cols_full, self.active_levels_from_raw(raw_row))
+
+     def summary_from_names_contrib(self, names, contrib, top_k=TOP_K_TEXT, min_abs=MIN_ABS_SHOW, include_neg=False):
+         pos = [(n, v) for n, v in zip(names, contrib) if v >= min_abs][:top_k]
+         neg = [(n, v) for n, v in zip(names, contrib) if v <= -min_abs][:top_k] if include_neg else []
+         def to_dict(items):
+             return [{"name": n, "delta_pp": float(abs(v))} for n, v in items]
+         return to_dict(pos), to_dict(neg), pos, neg
+
+     def build_one_liner(self, final_class: str, stage_used: str, p100: float, yhat: float,
+                         k_thr: int | None, pos_pairs, neg_pairs):
+         def short(items):
+             take = items[:TOP_K_ONELINER]
+             return ", ".join([f"{n} ({abs(v):.0%} pp)" for n, v in take]) if take else "—"
+         if stage_used == "stage1":
+             up = short([p for p in pos_pairs if p[1] > 0])
+             down = short([n for n in neg_pairs if n[1] < 0])
+             return (f"Class **{final_class}**: p(100%)={p100:.0%}. "
+                     f"Pushed up by: {up}; pushed down by: {down}. "
+                     f"Expected value {yhat:.1f}.")
+         else:
+             up = short([p for p in pos_pairs if p[1] > 0])
+             down = short([n for n in neg_pairs if n[1] < 0])
+             return (f"Class **{final_class}** (explained on P(y≥{k_thr})): "
+                     f"up: {up}; down: {down}. Expected value {yhat:.1f}.")
+
+     def predict_dict(self, payload: dict, include_neg: bool = False):
+         rid = str(uuid.uuid4())
+         raw = {k: payload.get(k, None) for k in FEATURE_MAP.values()}
+         df_row_raw = pd.DataFrame([raw])
+
+         te_oh = self.preprocess_apply(df_row_raw)
+         X_df = te_oh.reindex(columns=self.feat_cols_full, fill_value=0.0)
+         X = X_df.values
+
+         p100 = float(self.stage1_final.predict_proba(X)[:, 1][0])
+         prob_ord = self.stage2_final.predict_proba(X)[0]
+         prob_ord = prob_ord / (prob_ord.sum() or 1.0)
+         yhat = 100.0 * p100 + (1.0 - p100) * float(prob_ord @ MIDPOINTS)
+
+         if p100 >= P100_THR_AUTO:
+             names, contrib = self.explain_text_for_stage1(X[0], df_row_raw)
+             txt_pos, txt_neg, pos_pairs, neg_pairs = self.summary_from_names_contrib(
+                 names, contrib, include_neg=include_neg
+             )
+             final_class = "100%"
+             stage_used = "stage1"
+             k_thr = None
+         else:
+             k = int(np.argmax(prob_ord))
+             k_thr = min(max(1, k), 3)
+             names, contrib = self.explain_text_for_stage2(X[0], df_row_raw, k_thr=k_thr)
+             txt_pos, txt_neg, pos_pairs, neg_pairs = self.summary_from_names_contrib(
+                 names, contrib, include_neg=include_neg
+             )
+             final_class = LABELS[k]
+             stage_used = "stage2"
+
+         one_liner = self.build_one_liner(final_class, stage_used, p100, yhat, k_thr, pos_pairs, neg_pairs)
+
+         return {
+             "request_id": rid,
+             "stage_used": stage_used,
+             "class": final_class,
+             "p100": p100,
+             "expected_value": yhat,
+             "ordinal_probs": {LABELS[i]: float(prob_ord[i]) for i in range(len(LABELS))},
+             "k_thr": k_thr,
+             "shap": {
+                 "positivi_top": txt_pos,  # probability points (0..1)
+                 "negativi_top": txt_neg if include_neg else []
+             },
+             "one_liner": one_liner
+         }
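
The pipeline is two-stage: a logistic regression estimates p100, the probability of full (100%) recovery; when p100 < P100_THR_AUTO (0.71) the ordinal XGBoost assigns one of the bands quasi_nulla [0,11), bassa [11,30), media [30,70), alta [70,100), and the expected value blends the two stages as yhat = 100*p100 + (1-p100)*(prob_ord @ MIDPOINTS). Predictor can also be exercised without the API; a minimal local sketch with illustrative inputs, assuming final_report.csv is readable at the given path:

from model_pipeline import Predictor

# Train both stages from the CSV (this happens in __init__), then score a single case.
predictor = Predictor(data_path="final_report.csv")  # override the container default path
out = predictor.predict_dict(
    {
        "Importo iniziale outstanding": 25000.0,  # illustrative values; features left
        "giorni_da_iscrizione": 120,              # out of the payload are imputed with
        "giorni_da_cessione": 90,                 # the training means/modes
    },
    include_neg=True,
)
print(out["stage_used"], out["class"], round(out["expected_value"], 1))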
requirements.txt ADDED
@@ -0,0 +1,9 @@
+
+ fastapi==0.115.6
+ uvicorn[standard]==0.30.6
+ pydantic==2.8.2
+ numpy==1.26.4
+ pandas==2.2.2
+ scikit-learn==1.4.2
+ xgboost==2.0.3
+ shap==0.45.1