dong.hyun committed
Commit · 39ae30a · 1 Parent(s): 0d880ef
update readme. better user experience

Files changed:
- README.md +88 -140
- __init__.py +0 -7
- config.json +4 -1
- modeling_hyperclovax.py +1 -69
README.md CHANGED
@@ -69,154 +69,102 @@ Although HyperCLOVAX-SEED-Vision-Instruct-3B is a lightweight model, it is capab
 | InternV-2-4B | 4096 tokens, 16 frames | 33.8 | 36.0 | 22.8 | 54.2 | 52.0 | 22.7 | 83.0 | 76.9 | 51.6 | 46.11 | 39.75 | 42.58 |
 | InternV-2-8B | 4096 tokens, 16 frames | 43.7 | 41.2 | 32.4 | 58.5 | 53.2 | 28.5 | 86.6 | 79.0 | 97.0 | 50.32 | 45.79 | 47.81 |
 
 ## Dependencies
+- [einops](https://einops.rocks/)
+- [timm](https://github.com/huggingface/pytorch-image-models)
 - [av](https://github.com/PyAV-Org/PyAV)
 - [decord](https://github.com/dmlc/decord)
 
 ## Example
 
 ```python
-
-import importlib
-import os
-import sys
-from uuid import uuid4
-
-import psutil
-import torch
 from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 
-        },
-        {
-            "role": "user",
-            "content": {
-                "type": "image",
-                "filename": "tradeoff.png",
-                "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff.png?raw=true",
-            }
-        },
-        {"role": "assistant", "content": {"type": "text", "text": "Assistant Text 1"}},
-        {"role": "user", "content": {"type": "text", "text": "User Text 2"}},
-        {
-            "role": "user",
-            "content": {
-                "type": "video",
-                "filename": "rolling-mist-clouds.mp4",
-                "video": "freenaturestock-rolling-mist-clouds.mp4",
-            }
-        },
-        {"role": "user", "content": {"type": "text", "text": "User Text 3"}},
-    ]
-
-    new_vlm_chat, all_images, is_video_list = preprocessor.load_images_videos(vlm_chat)
-    preprocessed = preprocessor(all_images, is_video_list=is_video_list)
-    input_ids = tokenizer.apply_chat_template(
-        new_vlm_chat, return_tensors="pt", tokenize=True, add_generation_prompt=True,
-    )
-
-    output_ids = model.generate(
-        input_ids=input_ids.to(device=DEVICE),
-        max_new_tokens=64,
-        do_sample=True,
-        top_p=0.6,
-        temperature=0.5,
-        repetition_penalty=1.0,
-        **preprocessed,
-    )
-    print(tokenizer.batch_decode(output_ids)[0])
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-m", "--model_name_or_path", type=str, default="./HyperCLOVAX-Seed-Vision-3B")
-    parser.add_argument("--num_devices", type=int, default=1)
-    args = parser.parse_args()
-
-    main(args)
+model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device="cuda")
+preprocessor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# LLM Example
+# It is recommended to use the chat template with HyperCLOVAX models.
+# Using the chat template allows you to easily format your input in ChatML style.
+chat = [
+    {"role": "system", "content": "you are helpful assistant!"},
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+    {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt", tokenize=True)
+input_ids = input_ids.to(device="cuda")
+
+# Please adjust parameters like top_p appropriately for your use case.
+output_ids = model.generate(
+    input_ids,
+    max_new_tokens=64,
+    do_sample=True,
+    top_p=0.6,
+    temperature=0.5,
+    repetition_penalty=1.0,
+)
+print("=" * 80)
+print("LLM EXAMPLE")
+print(tokenizer.batch_decode(output_ids)[0])
+print("=" * 80)
+
+# VLM Example
+# For image and video inputs, you can use url, local_path, base64, or bytes.
+vlm_chat = [
+    {"role": "system", "content": {"type": "text", "text": "System Prompt"}},
+    {"role": "user", "content": {"type": "text", "text": "User Text 1"}},
+    {
+        "role": "user",
+        "content": {
+            "type": "image",
+            "filename": "tradeoff_sota.png",
+            "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff_sota.png?raw=true",
+            "ocr": "List the words in the image in raster order. Even if the word order feels unnatural for reading, the model will handle it as long as it follows raster order.",
+            "lens_keywords": "Gucci Ophidia, cross bag, Ophidia small, GG, Supreme shoulder bag",
+            "lens_local_keywords": "[0.07, 0.21, 0.92, 0.90] Gucci Ophidia",
+        }
+    },
+    {
+        "role": "user",
+        "content": {
+            "type": "image",
+            "filename": "tradeoff.png",
+            "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff.png?raw=true",
+        }
+    },
+    {"role": "assistant", "content": {"type": "text", "text": "Assistant Text 1"}},
+    {"role": "user", "content": {"type": "text", "text": "User Text 2"}},
+    {
+        "role": "user",
+        "content": {
+            "type": "video",
+            "filename": "rolling-mist-clouds.mp4",
+            "video": "freenaturestock-rolling-mist-clouds.mp4",
+        }
+    },
+    {"role": "user", "content": {"type": "text", "text": "User Text 3"}},
+]
+
+new_vlm_chat, all_images, is_video_list = preprocessor.load_images_videos(vlm_chat)
+preprocessed = preprocessor(all_images, is_video_list=is_video_list)
+input_ids = tokenizer.apply_chat_template(
+    new_vlm_chat, return_tensors="pt", tokenize=True, add_generation_prompt=True,
+)
+
+output_ids = model.generate(
+    input_ids=input_ids.to(device="cuda"),
+    max_new_tokens=8192,
+    do_sample=True,
+    top_p=0.6,
+    temperature=0.5,
+    repetition_penalty=1.0,
+    **preprocessed,
+)
+print(tokenizer.batch_decode(output_ids)[0])
 ```
 
 - To ensure the highest level of image understanding performance, it is recommended to include additional information such as Optical Character Recognition (OCR) results and entity recognition (Lens). The provided usage examples are written under the assumption that OCR and Lens results are available. If you input data in this format, you can expect significantly improved output quality.
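For quick reference, the enrichment recommended in the note above amounts to a few extra string fields on an image turn. The sketch below isolates that shape; the field names (`ocr`, `lens_keywords`, `lens_local_keywords`) are taken from the README example in the diff, while the file name, URL, and field values are placeholders.

```python
# A minimal sketch of one enriched image turn, following the README example above.
# The file name, URL, and the "ocr" / "lens_keywords" / "lens_local_keywords" values
# are placeholders; supply your own OCR and entity-recognition (Lens) results.
enriched_image_turn = {
    "role": "user",
    "content": {
        "type": "image",
        "filename": "example.png",
        "image": "https://example.com/example.png",  # url, local_path, base64, or bytes
        "ocr": "words detected in the image, listed in raster order",
        "lens_keywords": "entity keywords detected in the image",
        "lens_local_keywords": "[0.0, 0.0, 1.0, 1.0] entity name",  # normalized box + keyword
    },
}

# Append it to a vlm_chat list as in the example; turns without the OCR/Lens fields
# still work, but the extra context is what the note above credits for better output.
```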
__init__.py DELETED
@@ -1,7 +0,0 @@
-from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
-
-from .configuration_hyperclovax import HCXVisionConfig
-from .modeling_hyperclovax import HCXVisionForCausalLM
-
-AutoConfig.register("hyperclovax_vlm", HCXVisionConfig)
-AutoModelForCausalLM.register(HCXVisionConfig, HCXVisionForCausalLM)
config.json CHANGED
@@ -3,6 +3,10 @@
   "architectures": [
     "HCXVisionForCausalLM"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_hyperclovax.HCXVisionConfig",
+    "AutoModelForCausalLM": "modeling_hyperclovax.HCXVisionForCausalLM"
+  },
   "decoder_max_length": 16384,
   "freeze_decoder": false,
   "freeze_encoder": true,
@@ -115,7 +119,6 @@
       "SiglipVisionModel"
     ],
     "attention_dropout": 0.0,
-    "auto_map": {},
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
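Together with the deleted __init__.py above, this change moves class registration out of a local Python package and into the checkpoint itself: the new `auto_map` entries let `transformers` resolve the custom classes when the repo is loaded with `trust_remote_code=True`, so no explicit `AutoConfig.register(...)` call is needed. A minimal sketch of the resulting load path (the repo id is the one used in the README example; the rest is standard `transformers` usage):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# With "auto_map" in config.json, the repo's own configuration_hyperclovax.py and
# modeling_hyperclovax.py are used to build the objects; the deleted __init__.py and
# its register() calls are no longer required on the user's side.
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"

config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Expected concrete classes, per the auto_map entries above.
print(type(config).__name__)  # HCXVisionConfig
print(type(model).__name__)   # HCXVisionForCausalLM
```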
modeling_hyperclovax.py CHANGED
@@ -24,7 +24,6 @@ from transformers import (
     PreTrainedModel,
 )
 from transformers.generation.utils import GenerationMixin
-from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import (
     is_fsdp_enabled,
     is_local_dist_rank_0,
@@ -1503,68 +1502,6 @@ class HCXVisionForCausalLM(PreTrainedModel, GenerationMixin):
         return num_queries_vis_abstractors, num_grids, image_sizes, is_videos, group_ids
 
 
-def load_state_dict_into_model(model_to_load, state_dict, strict=True, start_prefix=""):
-    # from https://github.com/huggingface/transformers/blob/0a55d9f7376f72ad3ff296d4249840021b03bcc4/src/transformers/modeling_utils.py#L517
-    # Convert old format to new format if needed from a PyTorch state_dict
-    old_keys = []
-    new_keys = []
-    for key in state_dict.keys():
-        new_key = None
-        if "gamma" in key:
-            new_key = key.replace("gamma", "weight")
-        if "beta" in key:
-            new_key = key.replace("beta", "bias")
-        if new_key:
-            old_keys.append(key)
-            new_keys.append(new_key)
-    for old_key, new_key in zip(old_keys, new_keys):
-        state_dict[new_key] = state_dict.pop(old_key)
-
-    # copy state_dict so _load_from_state_dict can modify it
-    metadata = getattr(state_dict, "_metadata", None)
-    state_dict = state_dict.copy()
-    if metadata is not None:
-        state_dict._metadata = metadata
-
-    error_msgs = []
-
-    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-    # so we need to apply the function recursively.
-    def load(module: nn.Module, state_dict, prefix=""):
-        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-        args = (state_dict, prefix, local_metadata, strict, [], [], error_msgs)
-        # Parameters of module and children will start with prefix. We can exit early if there are none in this
-        # state_dict
-        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
-            if is_deepspeed_zero3_enabled():
-                import deepspeed
-
-                # In sharded models, each shard has only part of the full state_dict, so only gather
-                # parameters that are in the current state_dict.
-                named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-                params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
-                if len(params_to_gather) > 0:
-                    # because zero3 puts placeholders in model params, this context
-                    # manager gathers (unpartitions) the params of the current layer, then loads from
-                    # the state dict and then re-partitions them again
-                    with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
-                        if torch.distributed.get_rank() == 0:
-                            module._load_from_state_dict(*args)
-            else:
-                module._load_from_state_dict(*args)
-
-        for name, child in module._modules.items():
-            if child is not None:
-                load(child, state_dict, prefix + name + ".")
-
-    load(model_to_load, state_dict, prefix=start_prefix)
-    # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so
-    # it's safe to delete it.
-    del state_dict
-
-    return error_msgs
-
-
 class HCXVisionCAbstractor(nn.Module):
     """
     This module is based on C-Abstractor, whose license is under apache-2.0.
@@ -1781,12 +1718,7 @@ def load_sharded_checkpoint(
             for k, v in state_dict.items()
         }
 
-        if is_deepspeed_zero3_enabled():
-            # torch.distributed.barrier()
-            rank = torch.distributed.get_rank()
-            print(f"# [info] ZeRo3 - load sharded no {i}, rank {rank}")
-            load_state_dict_into_model(model, state_dict, strict=False)
-        elif is_fsdp_enabled():
+        if is_fsdp_enabled():
             if is_local_dist_rank_0():
                 model.load_state_dict(state_dict, strict=False)
             else: