import re

import torch
import gradio as gr
from transformers import BertTokenizer, BertModel

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


def process_text(text):
    # Remove non-ASCII characters and lowercase
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text).lower()

    # Tokenize
    inputs = tokenizer(cleaned, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Get BERT embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.squeeze(0)  # (seq_len, hidden_size)

    # Pair each token with its embedding (truncated for display)
    token_embeddings = []
    for token, emb in zip(tokens, embeddings):
        token_embeddings.append([token, str(emb[:5].tolist()) + '...'])  # truncate vector for readability

    return token_embeddings


# Gradio interface
gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="BERT Tokenizer & Embeddings Viewer",
    description="Removes non-ASCII characters, lowercases text, tokenizes with BERT, and shows token embeddings."
).launch()
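
# Optional sanity check (a minimal sketch, not part of the app): calling
# process_text directly returns a list of [token, truncated-embedding] pairs,
# with the first row being BERT's "[CLS]" token. If used, run it before
# .launch() (which blocks) or in a separate session with the same definitions.
#
# rows = process_text("Hello, world!")
# for token, emb_preview in rows[:3]:
#     print(token, emb_preview)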