bparekh99 committed on
Commit
c5cb3d5
·
verified ·
1 Parent(s): b99b3b6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -0
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ import gradio as gr
4
+ from transformers import BertTokenizer, BertModel
5
+
6
# Single checkpoint name so the tokenizer vocabulary and the encoder weights
# are always loaded from the same pretrained model.
MODEL_NAME = 'bert-base-uncased'

# Both objects are created once at import time and reused by process_text
# on every request.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)
9
+
10
# Matches runs of characters outside the 7-bit ASCII range (code points > 0x7F).
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


def process_text(text: str) -> list:
    """Tokenize *text* with BERT and pair each token with its embedding.

    The input is stripped of non-ASCII characters and lowercased, then run
    through the module-level ``tokenizer`` and ``model``.

    Args:
        text: Raw user input. ``None`` is treated as an empty string.

    Returns:
        A list of ``[token, preview]`` rows, one per token produced by the
        tokenizer (special tokens included). ``preview`` is the string form
        of the first five components of that token's last-hidden-state
        vector followed by ``'...'``.
    """
    # re.sub raises TypeError on None, so normalise a missing value first.
    cleaned = _NON_ASCII_RE.sub('', text or '').lower()

    # truncation=True caps the sequence at the tokenizer's configured maximum
    # length, so very long input cannot exceed what the model accepts.
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Drop the batch dimension (a single string was encoded) so that
    # iterating yields one vector per token.
    embeddings = outputs.last_hidden_state.squeeze(0)

    # Only the first five components are shown to keep the table readable.
    return [
        [token, str(emb[:5].tolist()) + '...']
        for token, emb in zip(tokens, embeddings)
    ]
29
+
30
# Gradio interface: a multi-line textbox feeds process_text and the returned
# rows are rendered as a two-column table.
demo = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="BERT Tokenizer & Embeddings Viewer",
    # The cleaning regex in process_text strips characters *outside* the ASCII
    # range, so the description says "non-ASCII".
    description="Removes non-ASCII characters, lowercases text, tokenizes using BERT, and shows token embeddings.",
)

# Start the web server as soon as the script is executed.
demo.launch()