This model is for educational research purposes only.
## Sample Code
```python
%%capture
!pip install -U bitsandbytes

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the quantized SmolVLM checkpoint
processor = AutoProcessor.from_pretrained("manifestasi/smolVLM-161M-q4-manifestasi")
model = AutoModelForVision2Seq.from_pretrained(
    "manifestasi/smolVLM-161M-q4-manifestasi",
    torch_dtype=torch.float16,
    _attn_implementation="eager",
).to(DEVICE)

from transformers.image_utils import load_image

# Load the input image
# image1 = load_image("https://huggingface.co/spaces/HuggingFaceTB/SmolVLM/resolve/main/example_images/rococo.jpg")
image2 = load_image("/kaggle/input/bandaraaa/799269_1200.jpg")

# Create the input messages
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": """
Instructions:
You are a visual assistant for blind people. Please answer politely,
in under 100 words.
Prompt:
Can you direct me to the toilet?
""",
            },
        ],
    },
]

# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image2], return_tensors="pt")
inputs = inputs.to(DEVICE)

# Generate outputs and time the generation
from time import time

tim1 = time()
generated_ids = model.generate(**inputs, max_new_tokens=120)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)
tim2 = time()
print(f"{(tim2 - tim1)} seconds")
print(generated_texts[0].split("Assistant: ")[1])
```
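The image path above only exists inside the Kaggle dataset used by the author. If you are running elsewhere, a minimal sketch of the same pipeline reuses the example image URL from the commented-out line above; everything else (`processor`, `model`, `prompt`, `DEVICE`) is assumed to be defined as in the snippet:

```python
# Variant for environments without the Kaggle dataset: fetch the example
# image over HTTP (URL taken from the commented-out line in the snippet).
image = load_image("https://huggingface.co/spaces/HuggingFaceTB/SmolVLM/resolve/main/example_images/rococo.jpg")

# Rebuild the inputs with the downloaded image and generate as before.
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
generated_ids = model.generate(**inputs, max_new_tokens=120)
texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(texts[0].split("Assistant: ")[1])
```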