---
license: apache-2.0
language:
- en
pipeline_tag: image-text-to-text
---

# This Model is for Educational Research Purpose Only.

# Sample Code

```python
%%capture
# Notebook-only syntax: `%%capture` has to stay on the first line of the cell,
# and `!pip` is an IPython shell escape. Drop both lines when running this as a
# plain Python script and install bitsandbytes beforehand instead.
!pip install -U bitsandbytes

from time import perf_counter

import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image

MODEL_ID = "manifestasi/smolVLM-161M-q4-manifestasi"
IMAGE_PATH = "/kaggle/input/bandaraaa/799269_1200.jpg"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Text that separates the echoed prompt from the model's reply in the decoded
# output (the original sample splits on this exact string).
ASSISTANT_MARKER = "Assistant: "

# Instruction + question sent together with the image.
PROMPT_TEXT = (
    " Instructions : you are visual assistant for blind people, "
    "please answer politely and short under 100 words. "
    "Prompt : can you direct me to find toilet "
)

# Load the processor and the model from the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    _attn_implementation="eager",
).to(DEVICE)

# Load images
# image1 = load_image("https://huggingface.co/spaces/HuggingFaceTB/SmolVLM/resolve/main/example_images/rococo.jpg")
image = load_image(IMAGE_PATH)

# Create input messages
messages = [
    {
        "role": "user",
        "content": [
            # {"type": "image"},
            {"type": "image"},
            {"type": "text", "text": PROMPT_TEXT},
        ],
    },
]

# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, return_tensors="pt")
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(DEVICE)

# Generate outputs; the timed span covers generation and decoding.
# perf_counter() is monotonic, so the measurement is not affected by
# system clock adjustments.
start = perf_counter()
generated_ids = model.generate(**inputs, max_new_tokens=120)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)
elapsed = perf_counter() - start

print(f"{elapsed} detik")

# Keep everything after the first marker. maxsplit=1 keeps the whole reply even
# if the reply itself contains the marker, and [-1] falls back to the full
# decoded text (instead of raising IndexError) when the marker is missing.
print(generated_texts[0].split(ASSISTANT_MARKER, 1)[-1])
```