import re
import torch
import gradio as gr
from sentence_transformers import SentenceTransformer

# Load SentenceTransformer model once at import time; it is reused by every request.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Matches runs of characters outside the 7-bit ASCII range.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


def process_text(text):
    """Return ``[token, truncated embedding]`` rows for the tokens of *text*.

    The input is stripped of non-ASCII characters and lowercased, then
    embedded token-by-token with the module-level ``model``.

    Args:
        text: Raw user input from the Gradio textbox. ``None`` or blank
            input yields an empty result.

    Returns:
        A list of two-element lists: the token string and a string showing
        the first five components of its embedding followed by ``'...'``.
    """
    if not text:
        return []

    # Remove non-ASCII characters and lowercase
    cleaned = _NON_ASCII_RE.sub('', text).lower()
    if not cleaned.strip():
        # Nothing left to embed; skip the model call.
        return []

    # Get token embeddings: one vector per model input position.
    token_embeddings = model.encode(
        cleaned,
        output_value='token_embeddings',
        convert_to_tensor=True,
    )

    # Recover the token strings from the same input ids the model consumes.
    # `model.tokenizer.tokenize()` alone omits the special tokens the model
    # adds (and does not apply the model's length truncation), which would
    # shift every token onto its neighbour's embedding.
    # NOTE(review): assumes `model.tokenize` applies the same preprocessing
    # as `model.encode` - confirm against the installed sentence-transformers.
    token_ids = model.tokenize([cleaned])['input_ids'][0].tolist()
    tokens = model.tokenizer.convert_ids_to_tokens(token_ids)
    special_ids = set(model.tokenizer.all_special_ids)

    # Pair each token with its embedding (truncated for display), skipping
    # special tokens so only tokens from the input text are listed.
    return [
        [token, f"{emb[:5].tolist()}..."]
        for token_id, token, emb in zip(token_ids, tokens, token_embeddings)
        if token_id not in special_ids
    ]


# Gradio interface
gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="SentenceTransformer Token Embeddings",
    description="Removes non-ASCII characters, lowercases input, tokenizes and embeds with SentenceTransformer.",
).launch()