An image captioning model, based on bert-tiny and vit-small, weighing only 100mb!

Works very fast on CPU.

from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
import requests, time
from PIL import Image

model_path = "cnmoro/tiny-image-captioning"

# load the image captioning model and corresponding tokenizer and image processor
model = VisionEncoderDecoderModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
image_processor = AutoImageProcessor.from_pretrained(model_path)

# preprocess an image
url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/New_york_times_square-terabass.jpg/800px-New_york_times_square-terabass.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = image_processor(image, return_tensors="pt").pixel_values

start = time.time()

# generate caption - suggested settings
generated_ids = model.generate(
    pixel_values,
    temperature=0.7,
    top_p=0.8,
    top_k=50,
    num_beams=3 # you can use 1 for even faster inference with a small drop in quality
)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

end = time.time()

print(generated_text)
# a group of people walking in the middle of a city.

print(f"Time taken: {end - start} seconds")
# Time taken: 0.11215853691101074 seconds
# on CPU !
Downloads last month
366
Safetensors
Model size
26.4M params
Tensor type
F32
ยท
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.

Model tree for cnmoro/tiny-image-captioning

Finetuned
(16)
this model

Spaces using cnmoro/tiny-image-captioning 2