---
library_name: birder
license: apache-2.0
---

# Model Card for vit_l16_mim

A ViT-L16 image encoder pre-trained using Masked Image Modeling (MIM). This model has *not* been fine-tuned for a specific classification task and is intended to be used as a general-purpose feature extractor or a backbone for downstream tasks such as object detection, segmentation, or custom classification.

## Model Details

- **Model Type:** Image encoder
- **Model Stats:**
    - Params (M): 303.3
    - Input image size: 224 x 224
- **Dataset:** Trained on a diverse dataset of approximately 11M images, including:
    - iNaturalist 2021 (~3.3M)

- **Papers:**
    - An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale: <https://arxiv.org/abs/2010.11929>
    - Masked Autoencoders Are Scalable Vision Learners: <https://arxiv.org/abs/2111.06377>

## Model Usage

```python
import torch
import birder
from PIL import Image

(net, model_info) = birder.load_pretrained_model("vit_l16_mim_400", inference=True)

# Get the image size the model was trained on
size = birder.get_size_from_signature(model_info.signature)

# Create an inference transform
transform = birder.classification_transform(size, model_info.rgb_stats)

image = Image.open("path/to/image.jpeg")
input_tensor = transform(image).unsqueeze(dim=0)
with torch.inference_mode():
    embedding = net.embedding(input_tensor)
# embedding is a tensor of shape (1, 1024)
```
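
The card positions this encoder as a general-purpose feature extractor or backbone. As one concrete (unofficial) pattern, the embedding above can feed a small trainable head such as a linear probe. The sketch below reuses `net`, the 224 x 224 input size, and the 1024-dimensional embedding from the example above; `num_classes`, the random stand-in batch, and the optimizer settings are illustrative assumptions, not part of the birder API.

```python
import torch
from torch import nn

# Train only a linear head on top of the frozen encoder (linear probing).
# `num_classes` is a hypothetical placeholder for your downstream task.
num_classes = 10
head = nn.Linear(1024, num_classes)  # 1024 = embedding size of vit_l16_mim
optimizer = torch.optim.AdamW(head.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Keep the encoder frozen; only the head is trained
net.eval()
for param in net.parameters():
    param.requires_grad = False

# One illustrative training step; in practice `batch` and `labels` come from a
# DataLoader that applies the `transform` created above
batch = torch.randn(8, 3, 224, 224)  # stand-in for 8 transformed 224 x 224 images
labels = torch.randint(0, num_classes, (8,))

# no_grad (rather than inference_mode) so the features can feed the trainable head
with torch.no_grad():
    features = net.embedding(batch)  # (8, 1024)
logits = head(features)
loss = criterion(logits, labels)
loss.backward()
optimizer.step()
optimizer.zero_grad()
```

The same frozen-encoder recipe extends to the other downstream uses mentioned above, with detection or segmentation heads typically consuming intermediate feature maps rather than the pooled embedding. The parameter count quoted under Model Details can be checked the same way: `sum(p.numel() for p in net.parameters()) / 1e6` should come out close to 303.3.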

## Citation

```bibtex
@misc{dosovitskiy2021imageworth16x16words,
    title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
    author={Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby},
    year={2021},
    eprint={2010.11929},
    archivePrefix={arXiv},
    primaryClass={cs.CV},
    url={https://arxiv.org/abs/2010.11929},
}

@misc{he2021maskedautoencodersscalablevision,
    title={Masked Autoencoders Are Scalable Vision Learners},
    author={Kaiming He and Xinlei Chen and Saining Xie and Yanghao Li and Piotr Dollár and Ross Girshick},
    year={2021},
    eprint={2111.06377},
    archivePrefix={arXiv},
    primaryClass={cs.CV},
    url={https://arxiv.org/abs/2111.06377},
}
```