bparekh99 committed on
Commit
76428a7
·
verified ·
1 Parent(s): a1fa05b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -0
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ import gradio as gr
4
+ from sentence_transformers import SentenceTransformer
5
+
6
# Sentence-embedding model used by process_text; instantiated once when the
# module is imported so it is not rebuilt on every call.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
8
+
9
def process_text(text):
    """Return one ``[token, truncated_embedding]`` row per token of *text*.

    The input is stripped of non-ASCII characters and lowercased, then
    embedded with the module-level ``model``. Each row pairs a token string
    with the first five components of its embedding, rendered as text for
    display.

    Args:
        text: Raw user input. ``None``, empty, or whitespace-only input
            yields an empty result without invoking the model.

    Returns:
        A list of ``[token, "[v0, v1, v2, v3, v4]..."]`` rows, in token order.
    """
    if not text:
        return []

    # Remove every run of NON-ASCII characters, then lowercase.
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text).lower()
    if not cleaned.strip():
        # Nothing left to embed; skip the model call entirely.
        return []

    # Per-token embeddings for the encoded sequence.
    token_embeddings = model.encode(
        cleaned, output_value='token_embeddings', convert_to_tensor=True
    )

    # Derive the tokens from the model's own tokenization rather than from
    # tokenizer.tokenize(): the encoded sequence carries boundary tokens
    # (e.g. [CLS]/[SEP]) and is truncated to the model's maximum length, so
    # a plain tokenize() list would be shifted against the embedding rows.
    tokenizer = model.tokenizer
    input_ids = model.tokenize([cleaned])['input_ids'][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Boundary/padding tokens are not part of the user's text; leave them
    # out of the table while keeping every remaining row aligned.
    boundary_ids = {
        tokenizer.cls_token_id,
        tokenizer.sep_token_id,
        tokenizer.pad_token_id,
    }

    # Only the first five components are shown to keep the table readable.
    return [
        [token, str(emb[:5].tolist()) + '...']
        for token_id, token, emb in zip(input_ids, tokens, token_embeddings)
        if token_id not in boundary_ids
    ]
23
+
24
# Gradio interface: a multi-line text box in, a two-column table of
# (token, truncated embedding) rows out. Launched at import time, as before.
demo = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="SentenceTransformer Token Embeddings",
    # The preprocessing strips NON-ASCII characters; the previous wording
    # ("Removes ASCII") described the opposite of what the code does.
    description="Removes non-ASCII characters, lowercases input, tokenizes and embeds with SentenceTransformer.",
)
demo.launch()