Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import os
|
|
| 3 |
import joblib
|
| 4 |
import torch
|
| 5 |
import numpy as np
|
| 6 |
-
import html
|
| 7 |
from transformers import AutoTokenizer, AutoModel, logging as hf_logging
|
| 8 |
import pandas as pd
|
| 9 |
import matplotlib
|
|
@@ -17,7 +17,7 @@ hf_logging.set_verbosity_error()
|
|
| 17 |
|
| 18 |
MODEL_NAME = "bert-base-uncased"
|
| 19 |
DEVICE = "cpu"
|
| 20 |
-
SAVE_DIR = "μ μ₯μ μ₯1"
|
| 21 |
LAYER_ID = 4
|
| 22 |
SEED = 0
|
| 23 |
CLF_NAME = "linear"
|
|
@@ -127,14 +127,14 @@ def create_empty_plotly_figure(message="N/A"):
|
|
| 127 |
)
|
| 128 |
return fig
|
| 129 |
|
| 130 |
-
# --- Core Analysis Function (returns
|
| 131 |
def analyze_sentence_for_gradio(sentence_text, top_k_value):
|
| 132 |
if not MODELS_LOADED_SUCCESSFULLY:
|
| 133 |
-
|
| 134 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 135 |
empty_fig = create_empty_plotly_figure("Model Loading Failed")
|
| 136 |
error_label_output = {"Status": "Error", "Message": "Model Loading Failed. Check logs."}
|
| 137 |
-
return
|
| 138 |
|
| 139 |
try:
|
| 140 |
tokenizer, model = TOKENIZER_GLOBAL, MODEL_GLOBAL
|
|
@@ -147,7 +147,7 @@ def analyze_sentence_for_gradio(sentence_text, top_k_value):
|
|
| 147 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 148 |
empty_fig = create_empty_plotly_figure("Invalid Input")
|
| 149 |
error_label_output = {"Status": "Error", "Message": "Invalid input, no valid tokens."}
|
| 150 |
-
return
|
| 151 |
|
| 152 |
input_embeds_detached = model.embeddings.word_embeddings(input_ids).clone().detach()
|
| 153 |
input_embeds_for_grad = input_embeds_detached.clone().requires_grad_(True)
|
|
@@ -167,7 +167,7 @@ def analyze_sentence_for_gradio(sentence_text, top_k_value):
|
|
| 167 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 168 |
empty_fig = create_empty_plotly_figure("Gradient Error")
|
| 169 |
error_label_output = {"Status": "Error", "Message": "Gradient calculation failed."}
|
| 170 |
-
return
|
| 171 |
|
| 172 |
grads = input_embeds_for_grad.grad.clone().detach()
|
| 173 |
scores = (grads * input_embeds_detached).norm(dim=2).squeeze(0)
|
|
@@ -180,7 +180,8 @@ def analyze_sentence_for_gradio(sentence_text, top_k_value):
|
|
| 180 |
actual_scores_np = scores_np[:len(actual_tokens)]
|
| 181 |
actual_input_embeds = input_embeds_detached[0, :len(actual_tokens), :].cpu().numpy()
|
| 182 |
|
| 183 |
-
|
|
|
|
| 184 |
cls_token_id, sep_token_id = tokenizer.cls_token_id, tokenizer.sep_token_id
|
| 185 |
|
| 186 |
for i, tok_str in enumerate(actual_tokens):
|
|
@@ -190,15 +191,10 @@ def analyze_sentence_for_gradio(sentence_text, top_k_value):
|
|
| 190 |
current_token_id = input_ids[0, i].item()
|
| 191 |
|
| 192 |
if current_token_id == cls_token_id or current_token_id == sep_token_id:
|
| 193 |
-
html_tokens_list.append(f"<span style='font-weight:bold;'>{html.escape(clean_tok_str)}</span>")
|
| 194 |
highlighted_text_data.append((clean_tok_str + " ", None))
|
| 195 |
else:
|
| 196 |
-
color = f"rgba(220, 50, 50, {current_score_clipped:.2f})"
|
| 197 |
-
html_tokens_list.append(f"<span style='background-color:{color}; color:white; padding: 1px 3px; margin: 1px; border-radius: 4px; display:inline-block;'>{html.escape(clean_tok_str)}</span>")
|
| 198 |
highlighted_text_data.append((clean_tok_str + " ", round(current_score_clipped, 3)))
|
| 199 |
|
| 200 |
-
html_output_str = " ".join(html_tokens_list).replace(" ##", "")
|
| 201 |
-
|
| 202 |
top_tokens_for_df, top_tokens_for_barplot_list = [], []
|
| 203 |
valid_indices = [idx for idx, token_id in enumerate(input_ids[0,:len(actual_tokens)].tolist())
|
| 204 |
if token_id not in [cls_token_id, sep_token_id]]
|
|
@@ -230,22 +226,22 @@ def analyze_sentence_for_gradio(sentence_text, top_k_value):
|
|
| 230 |
token_embeddings_3d = pca.fit_transform(pca_embeddings)
|
| 231 |
pca_fig = plot_token_pca_3d_plotly(token_embeddings_3d, pca_tokens, pca_scores_for_plot)
|
| 232 |
|
| 233 |
-
return (
|
| 234 |
prediction_summary_text, prediction_details_for_label,
|
| 235 |
top_tokens_for_df, barplot_df,
|
| 236 |
-
pca_fig)
|
| 237 |
|
| 238 |
except Exception as e:
|
| 239 |
import traceback
|
| 240 |
tb_str = traceback.format_exc()
|
| 241 |
-
|
| 242 |
print(f"analyze_sentence_for_gradio error: {e}\n{tb_str}")
|
| 243 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 244 |
empty_fig = create_empty_plotly_figure("Analysis Error")
|
| 245 |
error_label_output = {"Status": "Error", "Message": f"Analysis failed: {str(e)}"}
|
| 246 |
-
return
|
| 247 |
|
| 248 |
-
# --- Gradio UI Definition (
|
| 249 |
theme = gr.themes.Monochrome(
|
| 250 |
primary_hue=gr.themes.colors.blue,
|
| 251 |
secondary_hue=gr.themes.colors.sky,
|
|
@@ -262,7 +258,6 @@ with gr.Blocks(title="AI Sentence Analyzer XAI π", theme=theme, css=".gradio-
|
|
| 262 |
gr.Markdown("Analyze English sentences to understand BERT model predictions through various XAI visualization techniques. "
|
| 263 |
"Explore token importance and their distribution in the embedding space.")
|
| 264 |
|
| 265 |
-
# Inputs and Summary Outputs Row
|
| 266 |
with gr.Row(equal_height=False):
|
| 267 |
with gr.Column(scale=1, min_width=350):
|
| 268 |
with gr.Group():
|
|
@@ -278,18 +273,13 @@ with gr.Blocks(title="AI Sentence Analyzer XAI π", theme=theme, css=".gradio-
|
|
| 278 |
with gr.Accordion("β Top-K Important Tokens (Table)", open=True):
|
| 279 |
output_top_tokens_df = gr.DataFrame(headers=["Token", "Score"], label="Most Important Tokens",
|
| 280 |
row_count=(1,"dynamic"), col_count=(2,"fixed"), interactive=False, wrap=True)
|
| 281 |
-
gr.Markdown("---")
|
| 282 |
|
| 283 |
-
# Visualization Section Title
|
| 284 |
gr.Markdown("## π Detailed Visualizations")
|
|
|
|
|
|
|
| 285 |
|
| 286 |
-
|
| 287 |
-
with gr.Group():
|
| 288 |
-
gr.Markdown("### π¨ HTML Highlight (Custom)")
|
| 289 |
-
output_html_visualization = gr.HTML(label="Token Importance (Gradient x Input based)")
|
| 290 |
-
|
| 291 |
-
# Highlighted Text (Gradio) - Full Width
|
| 292 |
-
with gr.Group():
|
| 293 |
gr.Markdown("### ποΈ Highlighted Text (Gradio)")
|
| 294 |
output_highlighted_text = gr.HighlightedText(
|
| 295 |
label="Token Importance (Score: 0-1)",
|
|
@@ -297,9 +287,8 @@ with gr.Blocks(title="AI Sentence Analyzer XAI π", theme=theme, css=".gradio-
|
|
| 297 |
combine_adjacent=False
|
| 298 |
)
|
| 299 |
|
| 300 |
-
# BarPlot and PCA Plot Side-by-Side
|
| 301 |
-
|
| 302 |
-
with gr.Column(scale=1, min_width=400): # Adjusted min_width for BarPlot
|
| 303 |
with gr.Group():
|
| 304 |
gr.Markdown("### π Top-K Bar Plot")
|
| 305 |
output_top_tokens_barplot = gr.BarPlot(
|
|
@@ -307,14 +296,14 @@ with gr.Blocks(title="AI Sentence Analyzer XAI π", theme=theme, css=".gradio-
|
|
| 307 |
x="token",
|
| 308 |
y="score",
|
| 309 |
tooltip=['token', 'score'],
|
| 310 |
-
min_width=300
|
| 311 |
)
|
| 312 |
-
with gr.Column(scale=1, min_width=400):
|
| 313 |
with gr.Group():
|
| 314 |
gr.Markdown("### π Token Embeddings 3D PCA (Interactive)")
|
| 315 |
output_pca_plot = gr.Plot(label="3D PCA of Token Embeddings (Colored by Importance Score)")
|
| 316 |
|
| 317 |
-
gr.Markdown("---")
|
| 318 |
|
| 319 |
gr.Examples(
|
| 320 |
examples=[
|
|
@@ -323,8 +312,8 @@ with gr.Blocks(title="AI Sentence Analyzer XAI π", theme=theme, css=".gradio-
|
|
| 323 |
["I was thoroughly disappointed with the lackluster performance and predictable plot.", 4]
|
| 324 |
],
|
| 325 |
inputs=[input_sentence, input_top_k],
|
| 326 |
-
outputs=[
|
| 327 |
-
|
| 328 |
output_prediction_summary, output_prediction_details,
|
| 329 |
output_top_tokens_df, output_top_tokens_barplot,
|
| 330 |
output_pca_plot
|
|
@@ -337,8 +326,8 @@ with gr.Blocks(title="AI Sentence Analyzer XAI π", theme=theme, css=".gradio-
|
|
| 337 |
submit_button.click(
|
| 338 |
fn=analyze_sentence_for_gradio,
|
| 339 |
inputs=[input_sentence, input_top_k],
|
| 340 |
-
outputs=[
|
| 341 |
-
|
| 342 |
output_prediction_summary, output_prediction_details,
|
| 343 |
output_top_tokens_df, output_top_tokens_barplot,
|
| 344 |
output_pca_plot
|
|
|
|
| 3 |
import joblib
|
| 4 |
import torch
|
| 5 |
import numpy as np
|
| 6 |
+
import html # Kept: html.escape may still be used when building highlighted_text_data
|
| 7 |
from transformers import AutoTokenizer, AutoModel, logging as hf_logging
|
| 8 |
import pandas as pd
|
| 9 |
import matplotlib
|
|
|
|
| 17 |
|
| 18 |
MODEL_NAME = "bert-base-uncased"
|
| 19 |
DEVICE = "cpu"
|
| 20 |
+
SAVE_DIR = "μ μ₯μ μ₯1"
|
| 21 |
LAYER_ID = 4
|
| 22 |
SEED = 0
|
| 23 |
CLF_NAME = "linear"
|
|
|
|
| 127 |
)
|
| 128 |
return fig
|
| 129 |
|
| 130 |
+
# --- Core Analysis Function (returns 6 items for Gradio UI) ---
|
| 131 |
def analyze_sentence_for_gradio(sentence_text, top_k_value):
|
| 132 |
if not MODELS_LOADED_SUCCESSFULLY:
|
| 133 |
+
# HTML output removed, adjust error return
|
| 134 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 135 |
empty_fig = create_empty_plotly_figure("Model Loading Failed")
|
| 136 |
error_label_output = {"Status": "Error", "Message": "Model Loading Failed. Check logs."}
|
| 137 |
+
return [], "Model Loading Failed", error_label_output, [], empty_df, empty_fig # 6 items
|
| 138 |
|
| 139 |
try:
|
| 140 |
tokenizer, model = TOKENIZER_GLOBAL, MODEL_GLOBAL
|
|
|
|
| 147 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 148 |
empty_fig = create_empty_plotly_figure("Invalid Input")
|
| 149 |
error_label_output = {"Status": "Error", "Message": "Invalid input, no valid tokens."}
|
| 150 |
+
return [], "Input Error", error_label_output, [], empty_df, empty_fig # 6 items
|
| 151 |
|
| 152 |
input_embeds_detached = model.embeddings.word_embeddings(input_ids).clone().detach()
|
| 153 |
input_embeds_for_grad = input_embeds_detached.clone().requires_grad_(True)
|
|
|
|
| 167 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 168 |
empty_fig = create_empty_plotly_figure("Gradient Error")
|
| 169 |
error_label_output = {"Status": "Error", "Message": "Gradient calculation failed."}
|
| 170 |
+
return [],"Analysis Error", error_label_output, [], empty_df, empty_fig # 6 items
|
| 171 |
|
| 172 |
grads = input_embeds_for_grad.grad.clone().detach()
|
| 173 |
scores = (grads * input_embeds_detached).norm(dim=2).squeeze(0)
|
|
|
|
| 180 |
actual_scores_np = scores_np[:len(actual_tokens)]
|
| 181 |
actual_input_embeds = input_embeds_detached[0, :len(actual_tokens), :].cpu().numpy()
|
| 182 |
|
| 183 |
+
# HTML generation logic removed
|
| 184 |
+
highlighted_text_data = []
|
| 185 |
cls_token_id, sep_token_id = tokenizer.cls_token_id, tokenizer.sep_token_id
|
| 186 |
|
| 187 |
for i, tok_str in enumerate(actual_tokens):
|
|
|
|
| 191 |
current_token_id = input_ids[0, i].item()
|
| 192 |
|
| 193 |
if current_token_id == cls_token_id or current_token_id == sep_token_id:
|
|
|
|
| 194 |
highlighted_text_data.append((clean_tok_str + " ", None))
|
| 195 |
else:
|
|
|
|
|
|
|
| 196 |
highlighted_text_data.append((clean_tok_str + " ", round(current_score_clipped, 3)))
|
| 197 |
|
|
|
|
|
|
|
| 198 |
top_tokens_for_df, top_tokens_for_barplot_list = [], []
|
| 199 |
valid_indices = [idx for idx, token_id in enumerate(input_ids[0,:len(actual_tokens)].tolist())
|
| 200 |
if token_id not in [cls_token_id, sep_token_id]]
|
|
|
|
| 226 |
token_embeddings_3d = pca.fit_transform(pca_embeddings)
|
| 227 |
pca_fig = plot_token_pca_3d_plotly(token_embeddings_3d, pca_tokens, pca_scores_for_plot)
|
| 228 |
|
| 229 |
+
return (highlighted_text_data, # HTML output removed
|
| 230 |
prediction_summary_text, prediction_details_for_label,
|
| 231 |
top_tokens_for_df, barplot_df,
|
| 232 |
+
pca_fig) # 6 items
|
| 233 |
|
| 234 |
except Exception as e:
|
| 235 |
import traceback
|
| 236 |
tb_str = traceback.format_exc()
|
| 237 |
+
# HTML output removed
|
| 238 |
print(f"analyze_sentence_for_gradio error: {e}\n{tb_str}")
|
| 239 |
empty_df = pd.DataFrame(columns=['token', 'score'])
|
| 240 |
empty_fig = create_empty_plotly_figure("Analysis Error")
|
| 241 |
error_label_output = {"Status": "Error", "Message": f"Analysis failed: {str(e)}"}
|
| 242 |
+
return [], "Analysis Failed", error_label_output, [], empty_df, empty_fig # 6 items
|
| 243 |
|
| 244 |
+
# --- Gradio UI Definition (HTML Highlight Tab removed) ---
|
| 245 |
theme = gr.themes.Monochrome(
|
| 246 |
primary_hue=gr.themes.colors.blue,
|
| 247 |
secondary_hue=gr.themes.colors.sky,
|
|
|
|
| 258 |
gr.Markdown("Analyze English sentences to understand BERT model predictions through various XAI visualization techniques. "
|
| 259 |
"Explore token importance and their distribution in the embedding space.")
|
| 260 |
|
|
|
|
| 261 |
with gr.Row(equal_height=False):
|
| 262 |
with gr.Column(scale=1, min_width=350):
|
| 263 |
with gr.Group():
|
|
|
|
| 273 |
with gr.Accordion("β Top-K Important Tokens (Table)", open=True):
|
| 274 |
output_top_tokens_df = gr.DataFrame(headers=["Token", "Score"], label="Most Important Tokens",
|
| 275 |
row_count=(1,"dynamic"), col_count=(2,"fixed"), interactive=False, wrap=True)
|
| 276 |
+
gr.Markdown("---")
|
| 277 |
|
|
|
|
| 278 |
gr.Markdown("## π Detailed Visualizations")
|
| 279 |
+
|
| 280 |
+
# HTML Highlight (Custom) section removed
|
| 281 |
|
| 282 |
+
with gr.Group(): # HighlightedText
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
gr.Markdown("### ποΈ Highlighted Text (Gradio)")
|
| 284 |
output_highlighted_text = gr.HighlightedText(
|
| 285 |
label="Token Importance (Score: 0-1)",
|
|
|
|
| 287 |
combine_adjacent=False
|
| 288 |
)
|
| 289 |
|
| 290 |
+
with gr.Row(): # BarPlot and PCA Plot Side-by-Side
|
| 291 |
+
with gr.Column(scale=1, min_width=400):
|
|
|
|
| 292 |
with gr.Group():
|
| 293 |
gr.Markdown("### π Top-K Bar Plot")
|
| 294 |
output_top_tokens_barplot = gr.BarPlot(
|
|
|
|
| 296 |
x="token",
|
| 297 |
y="score",
|
| 298 |
tooltip=['token', 'score'],
|
| 299 |
+
min_width=300
|
| 300 |
)
|
| 301 |
+
with gr.Column(scale=1, min_width=400):
|
| 302 |
with gr.Group():
|
| 303 |
gr.Markdown("### π Token Embeddings 3D PCA (Interactive)")
|
| 304 |
output_pca_plot = gr.Plot(label="3D PCA of Token Embeddings (Colored by Importance Score)")
|
| 305 |
|
| 306 |
+
gr.Markdown("---")
|
| 307 |
|
| 308 |
gr.Examples(
|
| 309 |
examples=[
|
|
|
|
| 312 |
["I was thoroughly disappointed with the lackluster performance and predictable plot.", 4]
|
| 313 |
],
|
| 314 |
inputs=[input_sentence, input_top_k],
|
| 315 |
+
outputs=[ # output_html_visualization removed
|
| 316 |
+
output_highlighted_text,
|
| 317 |
output_prediction_summary, output_prediction_details,
|
| 318 |
output_top_tokens_df, output_top_tokens_barplot,
|
| 319 |
output_pca_plot
|
|
|
|
| 326 |
submit_button.click(
|
| 327 |
fn=analyze_sentence_for_gradio,
|
| 328 |
inputs=[input_sentence, input_top_k],
|
| 329 |
+
outputs=[ # output_html_visualization removed
|
| 330 |
+
output_highlighted_text,
|
| 331 |
output_prediction_summary, output_prediction_details,
|
| 332 |
output_top_tokens_df, output_top_tokens_barplot,
|
| 333 |
output_pca_plot
|