BERT_example / app.py
bparekh99's picture
Create app.py
c5cb3d5 verified
import re
import torch
import gradio as gr
from transformers import BertTokenizer, BertModel
# Load the pretrained tokenizer and encoder once, at import time.
# Both are created from the same checkpoint name.
_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertModel.from_pretrained(_CHECKPOINT)
def process_text(text):
    """Tokenize *text* with BERT and pair each token with an embedding preview.

    The input is stripped of non-ASCII characters and lowercased, tokenized
    with the module-level ``tokenizer``, and passed through the module-level
    ``model`` with gradient tracking disabled.

    Args:
        text: Raw user input. ``None`` is treated as an empty string.

    Returns:
        A list of ``[token, preview]`` rows, one per token emitted by the
        tokenizer. ``preview`` is the string form of the first five values
        of that token's last-hidden-state vector, followed by ``'...'``.
    """
    preview_dims = 5  # number of leading embedding values shown per token

    # The character class is negated, so it matches runs of characters
    # *outside* the ASCII range: non-ASCII text is what gets removed.
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text or '').lower()

    # Truncate to the model's position-embedding limit; without this, input
    # that tokenizes to more positions than the model supports makes the
    # forward pass fail instead of returning a result.
    inputs = tokenizer(
        cleaned,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Drop the leading batch axis -> (seq_len, hidden_size), then slice the
    # preview columns for every token in a single operation.
    embeddings = outputs.last_hidden_state.squeeze(0)
    previews = embeddings[:, :preview_dims].tolist()

    # One row per token; the vector is truncated for readability.
    return [
        [token, str(values) + '...']
        for token, values in zip(tokens, previews)
    ]
# Gradio interface: a multi-line textbox feeds process_text, and the returned
# rows are rendered as a two-column table. The description states that
# non-ASCII characters are removed, matching the regex used by process_text.
demo = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="BERT Tokenizer & Embeddings Viewer",
    description="Removes non-ASCII characters, lowercases text, tokenizes using BERT, and shows token embeddings.",
)
demo.launch()