<!-- VLM / index.html (commit e75589a: "change ui style to be more modern").
     The lines above were scraped GitHub page chrome, not document content;
     kept here as a comment so the file starts with a valid doctype. -->
<!DOCTYPE html>
<!-- The UI text is Persian: declare language and direction on the root element
     (the stylesheet additionally sets direction: rtl on body). -->
<html lang="fa" dir="rtl">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Camera Interaction App</title>
<!-- Warm up the font CDN origin before the stylesheet requests resolve. -->
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<!-- Vazirmatn covers Persian glyphs; Material Icons supplies the select arrow. -->
<link
href="https://fonts.googleapis.com/css2?family=Vazirmatn:wght@400;600;700&display=swap"
rel="stylesheet"
>
<link
rel="stylesheet"
href="https://fonts.googleapis.com/icon?family=Material+Icons"
>
<style>
/* Design tokens: Material-flavored palette, elevation, radius, and type stack. */
:root {
--primary-color: #6200ee;
--primary-variant: #3700b3;
--secondary-color: #03dac6;
--background-color: #fff;
--surface-color: #fff;
--error-color: #b00020;
--text-primary: #212121;
--text-secondary: #757575;
--disabled-color: #bdbdbd;
--elevation-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
--border-radius: 4px;
--font-family: 'Vazirmatn', 'Inter', 'Segoe UI', Arial, sans-serif;
}
/* Center the app in the viewport; RTL layout for Persian UI text. */
body {
font-family: var(--font-family);
background-color: var(--background-color);
color: var(--text-primary);
margin: 0;
padding: 0;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
min-height: 100vh;
direction: rtl;
}
.container {
width: 90%;
max-width: 800px;
padding: 24px;
box-sizing: border-box;
}
.header {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 24px;
}
.header h1 {
font-size: 1.5rem;
font-weight: 500;
color: var(--primary-color);
margin: 0;
}
/* 4:3 stage for the camera feed; positioned children overlay it. */
.video-wrapper {
position: relative;
width: 100%;
aspect-ratio: 4 / 3;
border-radius: var(--border-radius);
overflow: hidden;
box-shadow: var(--elevation-shadow);
margin-bottom: 24px;
}
#videoFeed {
width: 100%;
height: 100%;
object-fit: cover;
display: block;
}
/* Dimming overlay shown during model load; JS toggles display none <-> flex. */
#loadingOverlay {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-color: rgba(0, 0, 0, 0.5);
color: white;
display: none;
justify-content: center;
align-items: center;
font-size: 1.2rem;
}
.input-group {
margin-bottom: 16px;
}
.input-group label {
display: block;
margin-bottom: 8px;
color: var(--text-secondary);
}
.input-group textarea {
width: 100%;
padding: 12px;
border: 1px solid var(--disabled-color);
border-radius: var(--border-radius);
box-sizing: border-box;
font-family: inherit;
font-size: 1rem;
resize: none;
text-align: right;
}
.controls {
display: flex;
flex-direction: column;
gap: 16px;
}
/* Custom select: the native arrow is suppressed (appearance: none) and replaced
   by a Material icon absolutely positioned on the left (RTL layout). */
.select-wrapper {
position: relative;
display: flex;
align-items: center;
border: 1px solid var(--disabled-color);
border-radius: var(--border-radius);
overflow: hidden;
background-color: var(--surface-color);
}
.select-wrapper select {
padding: 12px 40px 12px 12px;
border: none;
background-color: transparent;
font-family: inherit;
font-size: 1rem;
color: var(--text-primary);
appearance: none;
text-align: right;
flex: 1;
}
.select-wrapper .material-icons {
position: absolute;
left: 12px;
color: var(--text-secondary);
pointer-events: none;
}
/* Shared button chrome; color variants below. JS swaps .primary/.running. */
.button {
padding: 12px 24px;
border: none;
border-radius: var(--border-radius);
font-family: inherit;
font-size: 1rem;
font-weight: 500;
text-transform: uppercase;
cursor: pointer;
box-shadow: var(--elevation-shadow);
transition: background-color 0.3s;
}
.button.primary {
background-color: var(--primary-color);
color: white;
}
.button.primary:hover {
background-color: var(--primary-variant);
}
/* Generic hide (used for the offscreen capture canvas). */
.hidden {
display: none;
}
/* Utility Classes */
.mt-2 {
margin-top: 16px;
}
.mb-2 {
margin-bottom: 16px;
}
</style>
</head>
<body>
<div class="container">
<header class="header">
<h1>مدل زبانی-بصری فارسی</h1>
</header>
<div class="video-wrapper">
<video id="videoFeed" autoplay playsinline></video>
<!-- role="status" lets assistive tech announce the loading state. -->
<div id="loadingOverlay" role="status">در حال بارگذاری...</div>
</div>
<!-- Offscreen buffer used by captureImage(); never displayed. -->
<canvas id="canvas" class="hidden"></canvas>
<div class="input-group">
<label for="responseText">پاسخ:</label>
<!-- aria-live announces each new model response written by the JS loop. -->
<textarea
id="responseText"
rows="4"
readonly
aria-live="polite"
placeholder="پاسخ سرور اینجا نمایش داده می‌شود..."
></textarea>
</div>
<div class="controls">
<div class="select-wrapper mb-2">
<!-- No visible label exists, so give the interval picker an accessible name. -->
<select id="intervalSelect" aria-label="فاصله بین درخواست‌ها">
<option value="0">۰ میلی‌ثانیه</option>
<option value="100">۱۰۰ میلی‌ثانیه</option>
<option value="250">۲۵۰ میلی‌ثانیه</option>
<option value="500">۵۰۰ میلی‌ثانیه</option>
<option value="1000">۱ ثانیه</option>
<option value="2000">۲ ثانیه</option>
</select>
<!-- Decorative dropdown arrow; hidden from assistive tech. -->
<i class="material-icons" aria-hidden="true">arrow_drop_down</i>
</div>
<!-- type="button" avoids implicit submit semantics (default button type). -->
<button id="startButton" class="button primary" type="button">شروع</button>
</div>
</div>
<script type="module">
// Browser builds of Transformers.js (local VLM inference) and the OpenAI SDK,
// both loaded straight from a CDN as ES modules.
import {
AutoProcessor,
AutoModelForVision2Seq,
RawImage,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js';
import OpenAI from 'https://cdn.jsdelivr.net/npm/[email protected]/+esm';
// OpenAI-compatible endpoint used for the translation step.
const baseURL = 'https://api.avalai.ir/v1';
// SECURITY(review): this API key is embedded in client-side code and therefore
// visible to every visitor. It should be revoked and the request proxied
// through a backend that holds the key server-side.
const openai = new OpenAI({
apiKey: 'aa-H6NlUS0RP0RWYcNgh0eAIhsl0tBxJ1vgw4xG9M3HdFhXIS3h',
baseURL: baseURL,
dangerouslyAllowBrowser: true, // explicitly opts into the key-exposure risk above
});
// Cached references to the UI elements driven by the capture/translate loop.
const video = document.getElementById('videoFeed');
const canvas = document.getElementById('canvas');
const responseText = document.getElementById('responseText');
const intervalSelect = document.getElementById('intervalSelect');
const startButton = document.getElementById('startButton');
const loadingOverlay = document.getElementById('loadingOverlay');
// System prompt for the translation LLM (model caption -> Persian).
const CONTEXT = `
Translate the text into persian and only return the translated text without any other text.
`;
let stream; // active MediaStream from getUserMedia; undefined until granted
let isProcessing = false; // true while the capture loop is running
let processor, model; // lazily initialized by initModel()
/**
 * Downloads and initializes the SmolVLM processor and model on WebGPU.
 * Shows the loading overlay while working and reports progress in the
 * response box. Rethrows on failure so callers can stop initialization,
 * but always hides the overlay (the original left it stuck on error).
 */
async function initModel() {
  // Model checkpoint; the larger "HuggingFaceTB/SmolVLM-Instruct" also works.
  const modelId = 'HuggingFaceTB/SmolVLM-500M-Instruct';
  loadingOverlay.style.display = 'flex';
  try {
    responseText.value = 'Loading processor...';
    processor = await AutoProcessor.from_pretrained(modelId);
    responseText.value = 'Processor loaded. Loading model...';
    model = await AutoModelForVision2Seq.from_pretrained(modelId, {
      dtype: {
        embed_tokens: 'fp16',
        vision_encoder: 'q4',
        decoder_model_merged: 'q4',
      },
      device: 'webgpu', // load fails on browsers without WebGPU support
    });
    responseText.value = 'Model loaded. Initializing camera...';
  } catch (err) {
    console.error('Model initialization failed:', err);
    responseText.value = `Error loading model: ${err.message}`;
    throw err; // preserve propagation so callers know init did not complete
  } finally {
    loadingOverlay.style.display = 'none'; // never leave the overlay stuck
  }
}
/**
 * Requests webcam access (video only) and wires the stream into the
 * video element. On failure, the error is logged and surfaced both in
 * the response box and via alert().
 */
async function initCamera() {
  try {
    const mediaStream = await navigator.mediaDevices.getUserMedia({
      video: true,
      audio: false,
    });
    stream = mediaStream;
    video.srcObject = mediaStream;
    responseText.value = 'Camera access granted. Ready to start.';
  } catch (err) {
    console.error('Error accessing camera:', err);
    responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
    alert(
      `Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`
    );
  }
}
/**
 * Grabs the current video frame into the offscreen canvas and returns it
 * as a RawImage (RGBA, 4 channels), or null if the stream is not ready.
 */
function captureImage() {
  // Bail out until the camera has delivered at least one decoded frame.
  if (!stream || !video.videoWidth) {
    console.warn('Video stream not ready for capture.');
    return null;
  }
  const width = video.videoWidth;
  const height = video.videoHeight;
  canvas.width = width;
  canvas.height = height;
  // willReadFrequently keeps the backing store CPU-side for fast readback.
  const ctx = canvas.getContext('2d', { willReadFrequently: true });
  ctx.drawImage(video, 0, 0, width, height);
  const pixels = ctx.getImageData(0, 0, width, height);
  return new RawImage(pixels.data, pixels.width, pixels.height, 4);
}
// Runs the local SmolVLM model on a single captured frame.
// imgElement: RawImage produced by captureImage(); instruction: prompt text.
// Returns the generated answer with the prompt and special tokens stripped.
async function runLocalVisionInference(imgElement, instruction) {
// Chat-format the request; the { type: 'image' } slot is bound to imgElement
// when the processor is called below.
const messages = [
{
role: 'user',
content: [{ type: 'image' }, { type: 'text', text: instruction }],
},
];
const text = processor.apply_chat_template(messages, {
add_generation_prompt: true,
});
const inputs = await processor(text, [imgElement], {
do_image_splitting: false,
});
const generatedIds = await model.generate({
...inputs,
max_new_tokens: 100,
});
// Slice off the prompt tokens (first input_ids.dims.at(-1) positions) so only
// the newly generated ids are decoded.
const output = processor.batch_decode(
generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
{ skip_special_tokens: true }
);
return output[0].trim();
}
/**
 * Translates `text` to Persian via OpenRouter's chat-completions endpoint.
 * Returns the completion content; throws on HTTP errors or malformed payloads.
 *
 * SECURITY: the bearer token below ships to every visitor of this page —
 * revoke it and route this request through a server-side proxy instead.
 * NOTE: this helper is currently unused; sendData() calls callExternalLLmAPI2.
 */
async function callExternalLLmAPI(text) {
  const response = await fetch(
    'https://openrouter.ai/api/v1/chat/completions',
    {
      method: 'POST',
      headers: {
        Authorization:
          'Bearer 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',
        'HTTP-Referer': '<YOUR_SITE_URL>', // Optional. Site URL for rankings on openrouter.ai.
        'X-Title': '<YOUR_SITE_NAME>', // Optional. Site title for rankings on openrouter.ai.
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: 'qwen/qwen-2.5-72b-instruct:free',
        messages: [
          { role: 'system', content: CONTEXT },
          { role: 'user', content: text },
        ],
      }),
    }
  );
  if (!response.ok) {
    // Include the response body when available — the status code alone rarely
    // explains quota or auth failures.
    const detail = await response.text().catch(() => '');
    throw new Error(
      `HTTP error! Status: ${response.status}${detail ? ` - ${detail}` : ''}`
    );
  }
  const data = await response.json();
  // Guard against empty/odd payloads instead of crashing on undefined access.
  const generatedText = data?.choices?.[0]?.message?.content;
  if (typeof generatedText !== 'string') {
    throw new Error('Unexpected API response: missing completion content');
  }
  return generatedText;
}
/**
 * Sends `text` through the configured OpenAI-compatible client and returns
 * the trimmed completion (Persian translation, per the CONTEXT system prompt).
 */
async function callExternalLLmAPI2(text) {
  const completion = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      { role: 'system', content: CONTEXT },
      { role: 'user', content: text },
    ],
  });
  return completion.choices[0].message.content.trim();
}
/**
 * One iteration of the loop: capture a frame, caption it locally, translate
 * the caption, and display the result. Errors are caught and shown inline.
 */
async function sendData() {
  if (!isProcessing) return; // stopped between iterations
  const instruction = 'What do you see?';
  const rawImg = captureImage();
  if (rawImg === null) {
    responseText.value = 'Capture failed';
    return;
  }
  try {
    const caption = await runLocalVisionInference(rawImg, instruction);
    responseText.value = await callExternalLLmAPI2(caption);
  } catch (e) {
    console.error(e);
    responseText.value = `Error: ${e.message}`;
  }
}
// Promise-based delay: resolves after `ms` milliseconds.
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Drives repeated capture/translate cycles until isProcessing is cleared.
 * The interval between cycles is read once, when the loop starts.
 */
async function processingLoop() {
  const delayMs = parseInt(intervalSelect.value, 10);
  for (;;) {
    if (!isProcessing) break; // user pressed stop
    await sendData();
    if (!isProcessing) break; // stopped while the request was in flight
    await sleep(delayMs);
  }
}
/**
 * Starts the processing loop: flips UI state to "running" and kicks off
 * processingLoop(). Refuses to start when no camera stream is available.
 */
function handleStart() {
  if (!stream) {
    responseText.value = 'Camera not available. Cannot start.';
    alert('Camera not available. Please grant permission first.');
    return;
  }
  isProcessing = true;
  // Flip the toggle button into its "running" look.
  startButton.classList.remove('primary');
  startButton.classList.add('running');
  startButton.textContent = 'توقف';
  intervalSelect.disabled = true;
  responseText.value = 'Processing started...';
  processingLoop(); // fire-and-forget; the loop watches isProcessing
}
/**
 * Stops the processing loop and restores the idle UI state. The status
 * message is only replaced if it still shows the "started" placeholder.
 */
function handleStop() {
  isProcessing = false;
  // Restore the toggle button's idle look.
  startButton.classList.remove('running');
  startButton.classList.add('primary');
  startButton.textContent = 'شروع';
  intervalSelect.disabled = false;
  const status = responseText.value;
  if (status.startsWith('Processing started...')) {
    responseText.value = 'Processing stopped.';
  }
}
// Single toggle button: start when idle, stop when running.
startButton.addEventListener('click', () => {
  isProcessing ? handleStop() : handleStart();
});
// Page bootstrap: warn when WebGPU (required by the local model) is missing,
// then load the model and request camera access.
window.addEventListener('DOMContentLoaded', async () => {
  if (!navigator.gpu) {
    const feed = document.getElementById('videoFeed');
    const warning = document.createElement('p');
    warning.textContent = 'WebGPU is not available in this browser.';
    warning.style.color = 'red';
    warning.style.textAlign = 'center';
    feed.parentNode.insertBefore(warning, feed.nextSibling);
  }
  await initModel();
  await initCamera();
  responseText.placeholder = 'پاسخ سرور اینجا نمایش داده می‌شود...';
  startButton.textContent = isProcessing ? 'توقف' : 'شروع';
});
// Release the camera hardware when the page is closed or navigated away.
window.addEventListener('beforeunload', () => {
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
  }
});
</script>
</body>
</html>