Update README.md
Browse files
README.md
CHANGED
@@ -38,11 +38,23 @@ A Byte-Pair Encoding (BPE) tokenizer trained on over **3.4 lakh cleaned Telugu t
|
|
38 |
```python
|
39 |
from transformers import T5Tokenizer
|
40 |
|
|
|
41 |
tokenizer = T5Tokenizer.from_pretrained("Vipplav/telugu-bpe-23k")
|
42 |
|
43 |
-
|
|
|
|
|
|
|
44 |
tokens = tokenizer.tokenize(text)
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
```
|
47 |
|
48 |
|
|
|
38 |
```python
|
39 |
from transformers import T5Tokenizer
|
40 |
|
41 |
+
# Load tokenizer from Hugging Face Hub
|
42 |
tokenizer = T5Tokenizer.from_pretrained("Vipplav/telugu-bpe-23k")
|
43 |
|
44 |
+
# Sample Telugu input
|
45 |
+
text = "పరిశీలన తేదీ: 15-06-2025"
|
46 |
+
|
47 |
+
# Tokenize the input
|
48 |
tokens = tokenizer.tokenize(text)
|
49 |
+
|
50 |
+
# Decode tokens back to text
|
51 |
+
decoded = tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens), skip_special_tokens=True)
|
52 |
+
|
53 |
+
# Display results
|
54 |
+
print(f"\n📥 Input : {text}")
|
55 |
+
print(f"🔤 Tokens : {tokens}")
|
56 |
+
print(f"📝 Decoded : {decoded}")
|
57 |
+
|
58 |
```
|
59 |
|
60 |
|