<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>TokenVisualizer — Minimal</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
  <style>
    :root{
      --bg:#0b0f14; --text:#ffffff; --muted:#9aa4b2; --accent:#38bdf8; --border:#1f2a3a;
      --card1:#0c1624; --card2:#0a1220; --chip:#111827; --chip-border:#263246; --chip-hover:#1a2434;
      --mono:'JetBrains Mono',ui-monospace,Menlo,Consolas,monospace;
      --sans:Inter,system-ui,-apple-system,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",Arial,sans-serif;
    }
    *{box-sizing:border-box}
    body{margin:0;background:radial-gradient(900px 500px at 10% -10%,#07314a,transparent),var(--bg);color:var(--text);font-family:var(--sans)}
    .container{max-width:1100px;margin:0 auto;padding:1.25rem}
    header{padding-top:1.5rem}
    h1{margin:.2rem 0 .4rem;font-size:1.9rem}
    .sub{color:var(--muted);margin:.25rem 0 1rem}
    .card{background:linear-gradient(180deg,var(--card1),var(--card2));border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
    label span{color:var(--muted);font-size:.9rem}
    select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
    select:focus,textarea:focus{border-color:var(--accent)}
    .controls{display:grid;gap:.8rem;margin-bottom:1rem}
    .row{display:flex;gap:.75rem;align-items:center}
    .status{color:var(--muted)}
    .grid{display:grid;gap:1rem;grid-template-columns:1fr}
    @media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
    .head{display:flex;align-items:center;justify-content:space-between;margin-bottom:.5rem}
    .tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
    .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
    .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
    .chip.active{outline:2px solid var(--accent)}
    pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
    .caption{color:var(--muted);font-size:.9rem;margin-top:.5rem}
    footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
    a{color:var(--accent)}
  </style>
</head>
<body>
  <header class="container">
    <h1>TokenVisualizer</h1>
    <p class="sub">Live view of tokens and token IDs, powered by Transformers.js and running entirely in your browser.</p>
  </header>

  <main class="container">
    <section class="card controls">
      <label>
        <span>Model</span>
        <select id="model">
          <option value="local:gpt2">GPT-2 (local, fast)</option>
          <option value="Xenova/gpt2">GPT-2 (Hub)</option>
          <option value="Xenova/llama2-tokenizer">Llama-2 (Hub)</option>
          <option value="Xenova/mistral-tokenizer">Mistral (Hub)</option>
          <option value="Xenova/gemma-tokenizer">Gemma (Hub)</option>
          <option value="Xenova/bert-base-uncased">BERT Base Uncased (Hub)</option>
        </select>
      </label>
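      <!-- The Hub options above are assumed to be tokenizer-only repos that
           publish tokenizer.json; any Hub id with tokenizer files should work. -->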
      <label>
        <span>Text</span>
        <textarea id="input" rows="3">Hello world! This is a tokenizer demo.</textarea>
      </label>
      <div class="row">
        <span id="status" class="status">Loading tokenizer…</span>
      </div>
    </section>
    <section class="grid">
      <article class="card">
        <div class="head"><h3>Tokens</h3></div>
        <div id="tokens" class="tokens"></div>
        <p class="caption">Tokens are the subword chunks the tokenizer learned from large amounts of text.</p>
      </article>

      <article class="card">
        <div class="head"><h3>Token IDs</h3></div>
        <pre id="ids" class="ids"></pre>
        <p class="caption">IDs are how the model “sees” tokens: just numbers, one per token.</p>
      </article>
    </section>
  </main>
  <footer class="container">
    <small>Built by Peter Adams • Powered by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
  </footer>
  <script type="module">
    const tf = await import('./assets/vendor/transformers.min.js');
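    // NOTE: this path assumes a single-file browser build of Transformers.js
    // vendored at ./assets/vendor/; a CDN ESM import of @xenova/transformers
    // would work the same way if the local copy is missing.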

    // Cache downloaded files in the browser and allow loading tokenizers from relative paths.
    tf.env.useBrowserCache = true;
    tf.env.allowLocalModels = true;

    const $ = s => document.querySelector(s);
    const modelSel = $('#model');
    const inputEl = $('#input');
    const statusEl = $('#status');
    const tokensEl = $('#tokens');
    const idsEl = $('#ids');

    const state = { tokens: [], ids: [] };
    let tokenizer = null;
    let runId = 0; // incremented per tokenize() call so stale async results can be dropped

    const status = (msg) => { statusEl.textContent = msg; };
    const debounce = (fn, ms = 200) => { let t; return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), ms); }; };

    async function loadTokenizer(modelId){
      status('Loading tokenizer…');
      try {
        if (modelId === 'local:gpt2') {
          // Local copy shipped with the page; assumes tokenizer.json and
          // tokenizer_config.json live under ./assets/gpt2/.
          tokenizer = await tf.AutoTokenizer.from_pretrained('./assets/gpt2/');
        } else {
          tokenizer = await tf.AutoTokenizer.from_pretrained(modelId);
        }
        status('Tokenizer ready.');
      } catch (e) {
        console.error('Tokenizer load failed:', e);
        tokenizer = null;
        status('Failed to load tokenizer (network blocked or slow). Try the local GPT-2 option or check your connection.');
      }
    }
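    // from_pretrained() resolves either a Hub id ("Xenova/...") or a relative
    // path, fetching the tokenizer files over HTTP; with useBrowserCache set,
    // repeat loads should be served from the browser's Cache API.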

    async function tokenize(){
      const myRun = ++runId;
      if (!tokenizer) {
        await loadTokenizer(modelSel.value);
        if (!tokenizer) { render(); return; }
      }

      const text = (inputEl.value ?? '').trim();
      if (!text) {
        state.tokens = [];
        state.ids = [];
        render();
        status('Type to tokenize…');
        return;
      }

      status('Tokenizing…');
      try {
        // encode() returns a plain array of token ids (not an object with
        // .tokens/.ids properties); decoding each id individually yields the
        // matching token strings and keeps both arrays aligned.
        const ids = await tokenizer.encode(text);
        if (myRun !== runId) return; // a newer run superseded this one
        state.ids = Array.isArray(ids) ? ids : [];
        state.tokens = state.ids.map(id => tokenizer.decode([id]));
        render();
        status(`Done. ${state.tokens.length} tokens.`);
      } catch (e) {
        console.error('Tokenize failed:', e);
        render();
        status('Error tokenizing. See console.');
      }
    }
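    // If the bundled build exposes tokenizer.tokenize(text), that returns the
    // raw subword strings instead (e.g. "Ġworld" for GPT-2's byte-level BPE),
    // which may be preferable for a tokenizer demo over the decoded text above.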

    function render(){
      const tokens = Array.isArray(state.tokens) ? state.tokens : [];
      const ids = Array.isArray(state.ids) ? state.ids : [];

      tokensEl.innerHTML = '';
      tokens.forEach((tok, i) => {
        const chip = document.createElement('span');
        chip.className = 'chip';
        chip.dataset.i = i;
        chip.textContent = tok;
        chip.addEventListener('mouseenter', () => highlight(i, true));
        chip.addEventListener('mouseleave', () => highlight(i, false));
        tokensEl.appendChild(chip);
      });

      idsEl.textContent = ids.join(' ');
      // Status text is owned by the callers, so an empty token list here does
      // not overwrite a load-failure message with a generic prompt.
    }

    // Bracket the hovered token's id in the id list and mark its chip.
    function highlight(i, on){
      const ids = Array.isArray(state.ids) ? state.ids : [];
      if (!ids.length) return;

      const parts = ids.map((id, idx) => (idx === i && on) ? `[${id}]` : String(id));
      idsEl.textContent = parts.join(' ');

      const chip = tokensEl.querySelector(`[data-i="${i}"]`);
      if (chip) chip.classList.toggle('active', on);
    }
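    // Rebuilding the full id string on every hover is O(n), which is more than
    // fast enough for the few hundred tokens this view is designed to hold.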

    const debounced = debounce(tokenize, 200);
    inputEl.addEventListener('input', debounced);

    modelSel.addEventListener('change', async () => {
      tokenizer = null;
      await loadTokenizer(modelSel.value);
      tokenize();
    });

    await loadTokenizer(modelSel.value);
    tokenize();
  </script>
</body>
</html>