bparekh99's picture
Create app.py
76428a7 verified
import re
import torch
import gradio as gr
from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' SentenceTransformer once at module import so
# that every call to process_text() reuses the same model instance.
model = SentenceTransformer('all-MiniLM-L6-v2')
def process_text(text):
    """Tokenize *text* and pair each token with a preview of its embedding.

    The input is stripped of non-ASCII characters and lowercased, then passed
    through the module-level SentenceTransformer ``model``.

    Args:
        text: Raw user input. ``None``, empty, or whitespace-only input (also
            after non-ASCII removal) yields no rows.

    Returns:
        A list of ``[token, preview]`` rows, one per input token, where
        ``preview`` is the string form of the first five embedding components
        followed by ``'...'``.
    """
    # Drop every run of non-ASCII characters, then lowercase what is left.
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text or '').lower()
    if not cleaned.strip():
        # Nothing to tokenize, so skip the model call entirely.
        return []

    token_embeddings = model.encode(
        cleaned, output_value='token_embeddings', convert_to_tensor=True
    )

    # encode() yields one vector per input id, *including* the [CLS]/[SEP]
    # wrappers, while tokenizer.tokenize() omits them. Zipping those two
    # shifts every token onto its predecessor's embedding. Recover the tokens
    # from the same ids the model saw so positions line up exactly.
    input_ids = model.tokenize([cleaned])['input_ids'][0].tolist()
    tokens = model.tokenizer.convert_ids_to_tokens(input_ids)

    # Hide only the wrapper tokens; [UNK] stays because it stands for real
    # input text.
    wrapper_ids = {model.tokenizer.cls_token_id, model.tokenizer.sep_token_id}

    return [
        # Show just the first five components to keep the table readable.
        [token, str(emb[:5].tolist()) + '...']
        for token_id, token, emb in zip(input_ids, tokens, token_embeddings)
        if token_id not in wrapper_ids
    ]
# Gradio interface: a multi-line text box in, a two-column table of tokens
# and truncated embeddings out. Built and launched at module import.
demo = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="SentenceTransformer Token Embeddings",
    # process_text strips *non*-ASCII characters; the old text claimed the
    # opposite ("Removes ASCII").
    description="Removes non-ASCII characters, lowercases input, tokenizes and embeds with SentenceTransformer.",
)
demo.launch()