mrgrandsky committed · verified
Commit 02c35f2 · 1 Parent(s): 75de2d5

Update README.md


The previously provided code had some errors; they are fixed now and the example can be replicated.
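For reference, the corrected zero-shot classification example, assembled from the diff below (the pipeline construction line comes from the second hunk's header), reads as follows:

```python
from transformers import pipeline
from urllib.request import urlopen
from PIL import Image

# load pipeline
ckpt = "google/siglip2-base-patch16-224"
image_classifier = pipeline(model=ckpt, task="zero-shot-image-classification")

# load image and candidate labels
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(urlopen(url))
candidate_labels = ["2 cats", "a plane", "a remote"]

# run inference
outputs = image_classifier(image, candidate_labels=candidate_labels)
print(outputs)
# [{'score': 0.17189568281173706, 'label': '2 cats'}, {'score': 0.02414962463080883, 'label': 'a remote'}, {'score': 2.1914941044087755e-06, 'label': 'a plane'}]
```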

Files changed (1): README.md (+5 −1)
README.md CHANGED
@@ -26,6 +26,8 @@ Here is how to use this model to perform zero-shot image classification:
 
 ```python
 from transformers import pipeline
+from urllib.request import urlopen
+from PIL import Image
 
 # load pipeline
 ckpt = "google/siglip2-base-patch16-224"
@@ -33,11 +35,13 @@ image_classifier = pipeline(model=ckpt, task="zero-shot-image-classification")
 
 # load image and candidate labels
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(urlopen(url))
 candidate_labels = ["2 cats", "a plane", "a remote"]
 
 # run inference
-outputs = image_classifier(image, candidate_labels)
+outputs = image_classifier(image, candidate_labels=candidate_labels)
 print(outputs)
+# [{'score': 0.17189568281173706, 'label': '2 cats'}, {'score': 0.02414962463080883, 'label': 'a remote'}, {'score': 2.1914941044087755e-06, 'label': 'a plane'}]
 ```
 
 You can encode an image using the Vision Tower like so:
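The README's actual Vision Tower snippet falls outside the context shown in this diff. For orientation only, a minimal sketch of encoding an image with this checkpoint, assuming it loads through AutoModel/AutoProcessor and that the loaded model exposes get_image_features (the README's own code may differ), could look like this:

```python
# Sketch only: assumes AutoModel/AutoProcessor handle this checkpoint and that
# the model exposes get_image_features; the README's own snippet may differ.
import torch
from urllib.request import urlopen
from PIL import Image
from transformers import AutoModel, AutoProcessor

ckpt = "google/siglip2-base-patch16-224"
model = AutoModel.from_pretrained(ckpt)
processor = AutoProcessor.from_pretrained(ckpt)

# reuse the COCO test image from the example above
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(urlopen(url))

# preprocess the image and run it through the vision tower
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    image_embeddings = model.get_image_features(**inputs)

print(image_embeddings.shape)  # e.g. torch.Size([1, hidden_size])
```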