import re

import gradio as gr
from sentence_transformers import SentenceTransformer

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
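# all-MiniLM-L6-v2 produces 384-dimensional embeddings; inputs longer than
# the model's 256-word-piece limit are truncated.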
def process_text(text):
    # Strip non-ASCII characters and lowercase the input
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text).lower()

    # Per-token embeddings; these include the special [CLS] and [SEP] tokens
    token_embeddings = model.encode(cleaned, output_value='token_embeddings', convert_to_tensor=True)

    # tokenizer.tokenize() omits the special tokens, so drop the first and
    # last embedding to keep tokens and vectors aligned
    tokens = model.tokenizer.tokenize(cleaned)
    token_embeddings = token_embeddings[1:-1]

    # Pair each token with its embedding, truncated for display
    result = []
    for token, emb in zip(tokens, token_embeddings):
        result.append([token, str(emb[:5].tolist()) + '...'])  # show only the first 5 of 384 values
    return result
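# Illustrative call (token strings and values below are made up for illustration):
#   process_text("Héllo, wörld!")
#   cleans the input to "hllo, wrld!" and returns one row per WordPiece
#   token, e.g. ["hl", "[-0.12, 0.03, 0.25, -0.08, 0.11]..."]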
# Gradio interface
gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="SentenceTransformer Token Embeddings",
    description="Removes non-ASCII characters, lowercases the input, then tokenizes and embeds it with SentenceTransformer.",
).launch()
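# To try this locally (assuming the script is saved as app.py; the filename is
# a guess based on the usual Spaces convention):
#   pip install gradio sentence-transformers
#   python app.py
# then open the local URL that Gradio prints.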