---
license: apache-2.0
language:
- en
pipeline_tag: image-text-to-text
---
|
# This Model Is for Educational and Research Purposes Only
|
|
|
# Sample Code
|
|
|
```python
|
# Sample inference script: ask the model a question about one local image,
# print how long generation took, then print the model's answer.
#
# Install the extra dependency once, in its OWN notebook cell, before running:
#     %pip install -q -U bitsandbytes
# The install is kept out of the script body on purpose: a leading `%%capture`
# cell magic captures ALL output of the cell, which would hide the print()
# calls at the bottom, and IPython magics are not valid in a plain .py file.

from time import perf_counter

import torch
from PIL import Image  # not used below; kept for opening images manually
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image

MODEL_ID = "manifestasi/smolVLM-161M-q4-manifestasi"
# load_image also accepts a URL, e.g.
# "https://huggingface.co/spaces/HuggingFaceTB/SmolVLM/resolve/main/example_images/rococo.jpg"
IMAGE_PATH = "/kaggle/input/bandaraaa/799269_1200.jpg"
MAX_NEW_TOKENS = 120

# Text part of the user turn: a short instruction followed by the question.
USER_TEXT = """
Instructions :
you are visual assistant for blind people, please answer politely and short
under 100 words.
Prompt :
can you direct me to find toilet
"""

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision only on the GPU; fall back to float32 on CPU, where float16
# kernels may be missing or very slow depending on the PyTorch build.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    _attn_implementation="eager",
).to(DEVICE)

# Load the image the question refers to.
image = load_image(IMAGE_PATH)

# One user turn holding one image placeholder plus the text.
# NOTE(review): presumably one {"type": "image"} entry is needed per image
# passed to the processor - confirm before adding more images.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": USER_TEXT},
        ],
    },
]

# Render the chat template to a prompt string, then tokenize text + image.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(DEVICE)

# Time generation and decoding together, using a monotonic clock.
start = perf_counter()
generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

# generate() returns the prompt tokens followed by the new tokens. Decode only
# the new ones instead of splitting the decoded text on "Assistant: ", which
# raised IndexError when that marker was missing and cut the answer short if
# the marker appeared inside the answer itself.
prompt_length = inputs["input_ids"].shape[1]
generated_texts = processor.batch_decode(
    generated_ids[:, prompt_length:],
    skip_special_tokens=True,
)
elapsed = perf_counter() - start

print(f"{elapsed} detik")
print(generated_texts[0].strip())
|
|
|
```