dong.hyun committed on
Commit 39ae30a · 1 Parent(s): 0d880ef

update readme. better user experience

Files changed (4)
  1. README.md +88 -140
  2. __init__.py +0 -7
  3. config.json +4 -1
  4. modeling_hyperclovax.py +1 -69
README.md CHANGED
@@ -69,154 +69,102 @@ Although HyperCLOVAX-SEED-Vision-Instruct-3B is a lightweight model, it is capab
  | InternV-2-4B | 4096 tokens, 16 frames | 33.8 | 36.0 | 22.8 | 54.2 | 52.0 | 22.7 | 83.0 | 76.9 | 51.6 | 46.11 | 39.75 | 42.58 |
  | InternV-2-8B | 4096 tokens, 16 frames | 43.7 | 41.2 | 32.4 | 58.5 | 53.2 | 28.5 | 86.6 | 79.0 | 97.0 | 50.32 | 45.79 | 47.81 |

- ## Dependencies for Processor
+ ## Dependencies
+ - [einops](https://einops.rocks/)
+ - [timm](https://github.com/huggingface/pytorch-image-models)
  - [av](https://github.com/PyAV-Org/PyAV)
  - [decord](https://github.com/dmlc/decord)

  ## Example

  ```python
- import argparse
- import importlib
- import os
- import sys
- from uuid import uuid4
-
- import psutil
- import torch
+
  from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

- DEVICE = "cuda:0"
- DTYPE = torch.bfloat16  # It is recommended to use bf16. When using fp16, the output may become corrupted.
- MIN_CPU_MEMORY = 200 * 1024**3
- TOTAL_CPU_MEMORY = psutil.virtual_memory().total
- AVAILABLE_CPU_MEMORY = int(max(min(MIN_CPU_MEMORY, TOTAL_CPU_MEMORY * 0.80), TOTAL_CPU_MEMORY * 0.25))
-
-
- def execute_init_py(module_path):
-     module_dir = os.path.dirname(module_path)
-     module_name = os.path.basename(module_path)
-
-     sys.path.insert(0, module_dir)
-     _ = importlib.import_module(module_name)
-     sys.path.pop(0)
-
-
- def main(args):
-     # Register AutoTokenizer, AutoProcessor, AutoModelForCausalLM, and others by running the __init__.py file in the module.
-     execute_init_py(args.model_name_or_path)
-
-     # trust_remote_code=True! You can trust the HyperCLOVAX model :)
-     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-     preprocessor = AutoProcessor.from_pretrained(args.model_name_or_path, trust_remote_code=True)
-     # model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
-     # model = model.to(device=DEVICE, dtype=DTYPE)
-
-     max_memory = {"cpu": AVAILABLE_CPU_MEMORY}
-     for device_idx in range(0, args.num_devices):
-         ava_mem, total_mem = torch.cuda.mem_get_info(device_idx)
-         if ava_mem / total_mem <= (1.0 - 0.8):
-             continue
-         print(f"\tcuda_{device_idx}: {ava_mem} / {total_mem}")
-         max_memory[device_idx] = int(ava_mem * 0.8)
-     model = AutoModelForCausalLM.from_pretrained(
-         args.model_name_or_path,
-         low_cpu_mem_usage=True,
-         device_map="sequential",
-         max_memory=max_memory,
-         offload_folder=os.path.join("./tmp/", f"{uuid4()}"),
-         offload_state_dict=True,
-         torch_dtype=DTYPE,
-     )
-
-     # LLM Example
- # It is recommended to use the chat template with HyperCLOVAX models.
- # Using the chat template allows you to easily format your input in ChatML style.
-     chat = [
-         {"role": "system", "content": "you are helpful assistant!"},
-         {"role": "user", "content": "Hello, how are you?"},
-         {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
-         {"role": "user", "content": "I'd like to show off how chat templating works!"},
-     ]
-     input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt", tokenize=True)
-     input_ids = input_ids.to(device=DEVICE)
-
-     # Please adjust parameters like top_p appropriately for your use case.
-     output_ids = model.generate(
-         input_ids,
-         max_new_tokens=64,
-         do_sample=True,
-         top_p=0.6,
-         temperature=0.5,
-         repetition_penalty=1.0,
-     )
-     print("=" * 80)
-     print("LLM EXAMPLE")
-     print(tokenizer.batch_decode(output_ids)[0])
-     print("=" * 80)
-
-     # VLM Example
-     # For image and video inputs, you can use url, local_path, base64, or bytes.
-     vlm_chat = [
-         {"role": "system", "content": {"type": "text", "text": "System Prompt"}},
-         {"role": "user", "content": {"type": "text", "text": "User Text 1"}},
-         {
-             "role": "user",
-             "content": {
-                 "type": "image",
-                 "filename": "tradeoff_sota.png",
-                 "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff_sota.png?raw=true",
-                 "ocr": "List the words in the image in raster order. Even if the word order feels unnatural for reading, the model will handle it as long as it follows raster order.",
-                 "lens_keywords": "Gucci Ophidia, cross bag, Ophidia small, GG, Supreme shoulder bag",
-                 "lens_local_keywords": "[0.07, 0.21, 0.92, 0.90] Gucci Ophidia",
-             }
-         },
-         {
-             "role": "user",
-             "content": {
-                 "type": "image",
-                 "filename": "tradeoff.png",
-                 "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff.png?raw=true",
-             }
-         },
-         {"role": "assistant", "content": {"type": "text", "text": "Assistant Text 1"}},
-         {"role": "user", "content": {"type": "text", "text": "User Text 2"}},
-         {
-             "role": "user",
-             "content": {
-                 "type": "video",
-                 "filename": "rolling-mist-clouds.mp4",
-                 "video": "freenaturestock-rolling-mist-clouds.mp4",
-             }
-         },
-         {"role": "user", "content": {"type": "text", "text": "User Text 3"}},
-     ]
-
-     new_vlm_chat, all_images, is_video_list = preprocessor.load_images_videos(vlm_chat)
-     preprocessed = preprocessor(all_images, is_video_list=is_video_list)
-     input_ids = tokenizer.apply_chat_template(
-         new_vlm_chat, return_tensors="pt", tokenize=True, add_generation_prompt=True,
-     )
-
-     output_ids = model.generate(
-         input_ids=input_ids.to(device=DEVICE),
-         max_new_tokens=64,
-         do_sample=True,
-         top_p=0.6,
-         temperature=0.5,
-         repetition_penalty=1.0,
-         **preprocessed,
-     )
-     print(tokenizer.batch_decode(output_ids)[0])
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("-m", "--model_name_or_path", type=str, default="./HyperCLOVAX-Seed-Vision-3B")
-     parser.add_argument("--num_devices", type=int, default=1)
-     args = parser.parse_args()
-
-     main(args)
+ model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device="cuda")
+ preprocessor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # LLM Example
+ # It is recommended to use the chat template with HyperCLOVAX models.
+ # Using the chat template allows you to easily format your input in ChatML style.
+ chat = [
+     {"role": "system", "content": "you are helpful assistant!"},
+     {"role": "user", "content": "Hello, how are you?"},
+     {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+     {"role": "user", "content": "I'd like to show off how chat templating works!"},
+ ]
+ input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt", tokenize=True)
+ input_ids = input_ids.to(device="cuda")
+
+ # Please adjust parameters like top_p appropriately for your use case.
+ output_ids = model.generate(
+     input_ids,
+     max_new_tokens=64,
+     do_sample=True,
+     top_p=0.6,
+     temperature=0.5,
+     repetition_penalty=1.0,
+ )
+ print("=" * 80)
+ print("LLM EXAMPLE")
+ print(tokenizer.batch_decode(output_ids)[0])
+ print("=" * 80)
+
+ # VLM Example
+ # For image and video inputs, you can use url, local_path, base64, or bytes.
+ vlm_chat = [
+     {"role": "system", "content": {"type": "text", "text": "System Prompt"}},
+     {"role": "user", "content": {"type": "text", "text": "User Text 1"}},
+     {
+         "role": "user",
+         "content": {
+             "type": "image",
+             "filename": "tradeoff_sota.png",
+             "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff_sota.png?raw=true",
+             "ocr": "List the words in the image in raster order. Even if the word order feels unnatural for reading, the model will handle it as long as it follows raster order.",
+             "lens_keywords": "Gucci Ophidia, cross bag, Ophidia small, GG, Supreme shoulder bag",
+             "lens_local_keywords": "[0.07, 0.21, 0.92, 0.90] Gucci Ophidia",
+         }
+     },
+     {
+         "role": "user",
+         "content": {
+             "type": "image",
+             "filename": "tradeoff.png",
+             "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff.png?raw=true",
+         }
+     },
+     {"role": "assistant", "content": {"type": "text", "text": "Assistant Text 1"}},
+     {"role": "user", "content": {"type": "text", "text": "User Text 2"}},
+     {
+         "role": "user",
+         "content": {
+             "type": "video",
+             "filename": "rolling-mist-clouds.mp4",
+             "video": "freenaturestock-rolling-mist-clouds.mp4",
+         }
+     },
+     {"role": "user", "content": {"type": "text", "text": "User Text 3"}},
+ ]
+
+ new_vlm_chat, all_images, is_video_list = preprocessor.load_images_videos(vlm_chat)
+ preprocessed = preprocessor(all_images, is_video_list=is_video_list)
+ input_ids = tokenizer.apply_chat_template(
+     new_vlm_chat, return_tensors="pt", tokenize=True, add_generation_prompt=True,
+ )
+
+ output_ids = model.generate(
+     input_ids=input_ids.to(device="cuda"),
+     max_new_tokens=8192,
+     do_sample=True,
+     top_p=0.6,
+     temperature=0.5,
+     repetition_penalty=1.0,
+     **preprocessed,
+ )
+ print(tokenizer.batch_decode(output_ids)[0])
  ```

  - To ensure the highest level of image understanding performance, it is recommended to include additional information such as Optical Character Recognition (OCR) results and entity recognition (Lens). The provided usage examples are written under the assumption that OCR and Lens results are available. If you input data in this format, you can expect significantly improved output quality.
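Note: the OCR / Lens recommendation above corresponds to an image message of roughly the following shape — a condensed, illustrative sketch of the example already shown in the updated README (the filename, path, and field values here are placeholders, not real OCR or Lens output):

```python
# Condensed from the README example in this diff; all values below are placeholders.
image_message = {
    "role": "user",
    "content": {
        "type": "image",
        "filename": "product.png",          # placeholder file name
        "image": "./product.png",           # the README states url, local_path, base64, or bytes are accepted
        "ocr": "words read from the image, listed in raster order",
        "lens_keywords": "entity-recognition (Lens) keywords for the image",
        "lens_local_keywords": "[0.07, 0.21, 0.92, 0.90] keyword tied to a normalized bounding box",
    },
}
```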
__init__.py DELETED
@@ -1,7 +0,0 @@
- from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
-
- from .configuration_hyperclovax import HCXVisionConfig
- from .modeling_hyperclovax import HCXVisionForCausalLM
-
- AutoConfig.register("hyperclovax_vlm", HCXVisionConfig)
- AutoModelForCausalLM.register(HCXVisionConfig, HCXVisionForCausalLM)
config.json CHANGED
@@ -3,6 +3,10 @@
    "architectures": [
      "HCXVisionForCausalLM"
    ],
+   "auto_map": {
+     "AutoConfig": "configuration_hyperclovax.HCXVisionConfig",
+     "AutoModelForCausalLM": "modeling_hyperclovax.HCXVisionForCausalLM"
+   },
    "decoder_max_length": 16384,
    "freeze_decoder": false,
    "freeze_encoder": true,
@@ -115,7 +119,6 @@
      "SiglipVisionModel"
    ],
    "attention_dropout": 0.0,
-   "auto_map": {},
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
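The `auto_map` entries added above are what let `transformers` resolve the custom classes straight from the repo, replacing the manual registration the deleted `__init__.py` used to perform. A minimal loading sketch under that assumption (repo id taken from the updated README; requires access to the Hub):

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"

# With auto_map in config.json, trust_remote_code=True is enough for the Auto classes
# to resolve HCXVisionConfig and HCXVisionForCausalLM from the repo's own modules;
# no local __init__.py registration step is needed anymore.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
print(type(config).__name__, type(model).__name__)  # expected: HCXVisionConfig HCXVisionForCausalLM
```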
modeling_hyperclovax.py CHANGED
@@ -24,7 +24,6 @@ from transformers import (
      PreTrainedModel,
  )
  from transformers.generation.utils import GenerationMixin
- from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
  from transformers.modeling_utils import (
      is_fsdp_enabled,
      is_local_dist_rank_0,
@@ -1503,68 +1502,6 @@ class HCXVisionForCausalLM(PreTrainedModel, GenerationMixin):
          return num_queries_vis_abstractors, num_grids, image_sizes, is_videos, group_ids


- def load_state_dict_into_model(model_to_load, state_dict, strict=True, start_prefix=""):
-     # from https://github.com/huggingface/transformers/blob/0a55d9f7376f72ad3ff296d4249840021b03bcc4/src/transformers/modeling_utils.py#L517
-     # Convert old format to new format if needed from a PyTorch state_dict
-     old_keys = []
-     new_keys = []
-     for key in state_dict.keys():
-         new_key = None
-         if "gamma" in key:
-             new_key = key.replace("gamma", "weight")
-         if "beta" in key:
-             new_key = key.replace("beta", "bias")
-         if new_key:
-             old_keys.append(key)
-             new_keys.append(new_key)
-     for old_key, new_key in zip(old_keys, new_keys):
-         state_dict[new_key] = state_dict.pop(old_key)
-
-     # copy state_dict so _load_from_state_dict can modify it
-     metadata = getattr(state_dict, "_metadata", None)
-     state_dict = state_dict.copy()
-     if metadata is not None:
-         state_dict._metadata = metadata
-
-     error_msgs = []
-
-     # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-     # so we need to apply the function recursively.
-     def load(module: nn.Module, state_dict, prefix=""):
-         local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-         args = (state_dict, prefix, local_metadata, strict, [], [], error_msgs)
-         # Parameters of module and children will start with prefix. We can exit early if there are none in this
-         # state_dict
-         if len([key for key in state_dict if key.startswith(prefix)]) > 0:
-             if is_deepspeed_zero3_enabled():
-                 import deepspeed
-
-                 # In sharded models, each shard has only part of the full state_dict, so only gather
-                 # parameters that are in the current state_dict.
-                 named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-                 params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
-                 if len(params_to_gather) > 0:
-                     # because zero3 puts placeholders in model params, this context
-                     # manager gathers (unpartitions) the params of the current layer, then loads from
-                     # the state dict and then re-partitions them again
-                     with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
-                         if torch.distributed.get_rank() == 0:
-                             module._load_from_state_dict(*args)
-             else:
-                 module._load_from_state_dict(*args)
-
-         for name, child in module._modules.items():
-             if child is not None:
-                 load(child, state_dict, prefix + name + ".")
-
-     load(model_to_load, state_dict, prefix=start_prefix)
-     # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so
-     # it's safe to delete it.
-     del state_dict
-
-     return error_msgs
-
-
  class HCXVisionCAbstractor(nn.Module):
      """
      This module is based on C-Abstractor, whose license is under apache-2.0.
@@ -1781,12 +1718,7 @@ def load_sharded_checkpoint(
      for k, v in state_dict.items()
  }

- if is_deepspeed_zero3_enabled():
-     # torch.distributed.barrier()
-     rank = torch.distributed.get_rank()
-     print(f"# [info] ZeRo3 - load sharded no {i}, rank {rank}")
-     load_state_dict_into_model(model, state_dict, strict=False)
- elif is_fsdp_enabled():
+ if is_fsdp_enabled():
      if is_local_dist_rank_0():
          model.load_state_dict(state_dict, strict=False)
  else:
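For reference, the first step of the removed `load_state_dict_into_model` helper was renaming legacy LayerNorm parameter keys before loading. A self-contained sketch of just that renaming pass on a toy state dict (the keys and tensors below are invented for illustration):

```python
import torch

# Toy state dict for illustration only; real checkpoints come from torch.load / safetensors.
state_dict = {
    "vision.layernorm.gamma": torch.ones(4),   # legacy name for "weight"
    "vision.layernorm.beta": torch.zeros(4),   # legacy name for "bias"
    "lm.linear.weight": torch.randn(4, 4),
}

# Mirror the removed helper's renaming pass: gamma -> weight, beta -> bias.
old_keys, new_keys = [], []
for key in state_dict:
    new_key = None
    if "gamma" in key:
        new_key = key.replace("gamma", "weight")
    if "beta" in key:
        new_key = key.replace("beta", "bias")
    if new_key:
        old_keys.append(key)
        new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
    state_dict[new_key] = state_dict.pop(old_key)

print(sorted(state_dict))  # ['lm.linear.weight', 'vision.layernorm.bias', 'vision.layernorm.weight']
```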