# Spaces: Sleeping  (HuggingFace Spaces status banner captured along with the source)
import re

import torch
import gradio as gr
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT tokenizer and encoder once at import time.
# NOTE: downloads weights on first run (network access required); the model
# is returned in eval mode by from_pretrained, so no explicit .eval() call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def process_text(text):
    """Tokenize *text* with BERT and return per-token embedding previews.

    Parameters
    ----------
    text : str
        Raw user input; non-ASCII characters are stripped and the result is
        lowercased before tokenization.

    Returns
    -------
    list[list[str]]
        One ``[token, truncated_embedding_str]`` row per token, including the
        special [CLS]/[SEP] tokens the tokenizer adds. Each embedding is shown
        as its first 5 components followed by ``'...'`` for readability.
    """
    # Strip NON-ASCII characters (the regex keeps \x00-\x7F) and lowercase.
    # The original comment claimed the opposite ("Remove ASCII characters").
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text).lower()
    # Tokenize; truncate to BERT's 512-token positional limit so very long
    # inputs don't raise an index error inside the model.
    inputs = tokenizer(cleaned, return_tensors="pt",
                       truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    # Contextual embeddings from the final encoder layer; no gradients needed
    # for inference, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.squeeze(0)  # (seq_len, hidden_size)
    # Pair each token with a truncated preview of its embedding vector.
    return [
        [token, str(emb[:5].tolist()) + '...']
        for token, emb in zip(tokens, embeddings)
    ]
# Gradio UI: free-text input, token/embedding table output.
gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="BERT Tokenizer & Embeddings Viewer",
    # Description corrected: the preprocessing removes NON-ASCII characters
    # (the regex keeps \x00-\x7F), not ASCII ones.
    description=("Removes non-ASCII characters, lowercases text, tokenizes "
                 "using BERT, and shows token embeddings."),
).launch()