dong.hyun committed
Commit · 39ae30a · 1 Parent(s): 0d880ef
update readme. better user experience

Files changed:
- README.md +88 -140
- __init__.py +0 -7
- config.json +4 -1
- modeling_hyperclovax.py +1 -69
README.md CHANGED
@@ -69,154 +69,102 @@ Although HyperCLOVAX-SEED-Vision-Instruct-3B is a lightweight model, it is capab
 | InternV-2-4B | 4096 tokens, 16 frames | 33.8 | 36.0 | 22.8 | 54.2 | 52.0 | 22.7 | 83.0 | 76.9 | 51.6 | 46.11 | 39.75 | 42.58 |
 | InternV-2-8B | 4096 tokens, 16 frames | 43.7 | 41.2 | 32.4 | 58.5 | 53.2 | 28.5 | 86.6 | 79.0 | 97.0 | 50.32 | 45.79 | 47.81 |
 
 ## Dependencies
+- [einops](https://einops.rocks/)
+- [timm](https://github.com/huggingface/pytorch-image-models)
 - [av](https://github.com/PyAV-Org/PyAV)
 - [decord](https://github.com/dmlc/decord)
 
 ## Example
 
 ```python
-
-import importlib
-import os
-import sys
-from uuid import uuid4
-
-import psutil
-import torch
 from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 
-        },
-        {
-            "role": "user",
-            "content": {
-                "type": "image",
-                "filename": "tradeoff.png",
-                "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff.png?raw=true",
-            }
-        },
-        {"role": "assistant", "content": {"type": "text", "text": "Assistant Text 1"}},
-        {"role": "user", "content": {"type": "text", "text": "User Text 2"}},
-        {
-            "role": "user",
-            "content": {
-                "type": "video",
-                "filename": "rolling-mist-clouds.mp4",
-                "video": "freenaturestock-rolling-mist-clouds.mp4",
-            }
-        },
-        {"role": "user", "content": {"type": "text", "text": "User Text 3"}},
-    ]
-
-    new_vlm_chat, all_images, is_video_list = preprocessor.load_images_videos(vlm_chat)
-    preprocessed = preprocessor(all_images, is_video_list=is_video_list)
-    input_ids = tokenizer.apply_chat_template(
-        new_vlm_chat, return_tensors="pt", tokenize=True, add_generation_prompt=True,
-    )
-
-    output_ids = model.generate(
-        input_ids=input_ids.to(device=DEVICE),
-        max_new_tokens=64,
-        do_sample=True,
-        top_p=0.6,
-        temperature=0.5,
-        repetition_penalty=1.0,
-        **preprocessed,
-    )
-    print(tokenizer.batch_decode(output_ids)[0])
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-m", "--model_name_or_path", type=str, default="./HyperCLOVAX-Seed-Vision-3B")
-    parser.add_argument("--num_devices", type=int, default=1)
-    args = parser.parse_args()
-
-    main(args)
+model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device="cuda")
+preprocessor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# LLM Example
+# It is recommended to use the chat template with HyperCLOVAX models.
+# Using the chat template allows you to easily format your input in ChatML style.
+chat = [
+    {"role": "system", "content": "you are helpful assistant!"},
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+    {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt", tokenize=True)
+input_ids = input_ids.to(device="cuda")
+
+# Please adjust parameters like top_p appropriately for your use case.
+output_ids = model.generate(
+    input_ids,
+    max_new_tokens=64,
+    do_sample=True,
+    top_p=0.6,
+    temperature=0.5,
+    repetition_penalty=1.0,
+)
+print("=" * 80)
+print("LLM EXAMPLE")
+print(tokenizer.batch_decode(output_ids)[0])
+print("=" * 80)
+
+# VLM Example
+# For image and video inputs, you can use url, local_path, base64, or bytes.
+vlm_chat = [
+    {"role": "system", "content": {"type": "text", "text": "System Prompt"}},
+    {"role": "user", "content": {"type": "text", "text": "User Text 1"}},
+    {
+        "role": "user",
+        "content": {
+            "type": "image",
+            "filename": "tradeoff_sota.png",
+            "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff_sota.png?raw=true",
+            "ocr": "List the words in the image in raster order. Even if the word order feels unnatural for reading, the model will handle it as long as it follows raster order.",
+            "lens_keywords": "Gucci Ophidia, cross bag, Ophidia small, GG, Supreme shoulder bag",
+            "lens_local_keywords": "[0.07, 0.21, 0.92, 0.90] Gucci Ophidia",
+        }
+    },
+    {
+        "role": "user",
+        "content": {
+            "type": "image",
+            "filename": "tradeoff.png",
+            "image": "https://github.com/naver-ai/rdnet/blob/main/resources/images/tradeoff.png?raw=true",
+        }
+    },
+    {"role": "assistant", "content": {"type": "text", "text": "Assistant Text 1"}},
+    {"role": "user", "content": {"type": "text", "text": "User Text 2"}},
+    {
+        "role": "user",
+        "content": {
+            "type": "video",
+            "filename": "rolling-mist-clouds.mp4",
+            "video": "freenaturestock-rolling-mist-clouds.mp4",
+        }
+    },
+    {"role": "user", "content": {"type": "text", "text": "User Text 3"}},
+]
+
+new_vlm_chat, all_images, is_video_list = preprocessor.load_images_videos(vlm_chat)
+preprocessed = preprocessor(all_images, is_video_list=is_video_list)
+input_ids = tokenizer.apply_chat_template(
+    new_vlm_chat, return_tensors="pt", tokenize=True, add_generation_prompt=True,
+)
+
+output_ids = model.generate(
+    input_ids=input_ids.to(device="cuda"),
+    max_new_tokens=8192,
+    do_sample=True,
+    top_p=0.6,
+    temperature=0.5,
+    repetition_penalty=1.0,
+    **preprocessed,
+)
+print(tokenizer.batch_decode(output_ids)[0])
 ```
 
 - To ensure the highest level of image understanding performance, it is recommended to include additional information such as Optical Character Recognition (OCR) results and entity recognition (Lens). The provided usage examples are written under the assumption that OCR and Lens results are available. If you input data in this format, you can expect significantly improved output quality.
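For quick reference, the enrichment recommended in the note above amounts to a few extra string fields on an image turn. The sketch below isolates that shape; the field names (`ocr`, `lens_keywords`, `lens_local_keywords`) are taken from the README example in the diff, while the file name, URL, and field values are placeholders.

```python
# A minimal sketch of one enriched image turn, following the README example above.
# The file name, URL, and the "ocr" / "lens_keywords" / "lens_local_keywords" values
# are placeholders; supply your own OCR and entity-recognition (Lens) results.
enriched_image_turn = {
    "role": "user",
    "content": {
        "type": "image",
        "filename": "example.png",
        "image": "https://example.com/example.png",  # url, local_path, base64, or bytes
        "ocr": "words detected in the image, listed in raster order",
        "lens_keywords": "entity keywords detected in the image",
        "lens_local_keywords": "[0.0, 0.0, 1.0, 1.0] entity name",  # normalized box + keyword
    },
}

# Append it to a vlm_chat list as in the example; turns without the OCR/Lens fields
# still work, but the extra context is what the note above credits for better output.
```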
__init__.py DELETED
@@ -1,7 +0,0 @@
-from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
-
-from .configuration_hyperclovax import HCXVisionConfig
-from .modeling_hyperclovax import HCXVisionForCausalLM
-
-AutoConfig.register("hyperclovax_vlm", HCXVisionConfig)
-AutoModelForCausalLM.register(HCXVisionConfig, HCXVisionForCausalLM)
config.json CHANGED
@@ -3,6 +3,10 @@
   "architectures": [
     "HCXVisionForCausalLM"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_hyperclovax.HCXVisionConfig",
+    "AutoModelForCausalLM": "modeling_hyperclovax.HCXVisionForCausalLM"
+  },
   "decoder_max_length": 16384,
   "freeze_decoder": false,
   "freeze_encoder": true,
@@ -115,7 +119,6 @@
       "SiglipVisionModel"
     ],
     "attention_dropout": 0.0,
-    "auto_map": {},
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
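Together with the deleted __init__.py above, this change moves class registration out of a local Python package and into the checkpoint itself: the new `auto_map` entries let `transformers` resolve the custom classes when the repo is loaded with `trust_remote_code=True`, so no explicit `AutoConfig.register(...)` call is needed. A minimal sketch of the resulting load path (the repo id is the one used in the README example; the rest is standard `transformers` usage):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# With "auto_map" in config.json, the repo's own configuration_hyperclovax.py and
# modeling_hyperclovax.py are used to build the objects; the deleted __init__.py and
# its register() calls are no longer required on the user's side.
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"

config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Expected concrete classes, per the auto_map entries above.
print(type(config).__name__)  # HCXVisionConfig
print(type(model).__name__)   # HCXVisionForCausalLM
```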
modeling_hyperclovax.py CHANGED
@@ -24,7 +24,6 @@ from transformers import (
     PreTrainedModel,
 )
 from transformers.generation.utils import GenerationMixin
-from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import (
     is_fsdp_enabled,
     is_local_dist_rank_0,
@@ -1503,68 +1502,6 @@ class HCXVisionForCausalLM(PreTrainedModel, GenerationMixin):
         return num_queries_vis_abstractors, num_grids, image_sizes, is_videos, group_ids
 
 
-def load_state_dict_into_model(model_to_load, state_dict, strict=True, start_prefix=""):
-    # from https://github.com/huggingface/transformers/blob/0a55d9f7376f72ad3ff296d4249840021b03bcc4/src/transformers/modeling_utils.py#L517
-    # Convert old format to new format if needed from a PyTorch state_dict
-    old_keys = []
-    new_keys = []
-    for key in state_dict.keys():
-        new_key = None
-        if "gamma" in key:
-            new_key = key.replace("gamma", "weight")
-        if "beta" in key:
-            new_key = key.replace("beta", "bias")
-        if new_key:
-            old_keys.append(key)
-            new_keys.append(new_key)
-    for old_key, new_key in zip(old_keys, new_keys):
-        state_dict[new_key] = state_dict.pop(old_key)
-
-    # copy state_dict so _load_from_state_dict can modify it
-    metadata = getattr(state_dict, "_metadata", None)
-    state_dict = state_dict.copy()
-    if metadata is not None:
-        state_dict._metadata = metadata
-
-    error_msgs = []
-
-    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-    # so we need to apply the function recursively.
-    def load(module: nn.Module, state_dict, prefix=""):
-        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-        args = (state_dict, prefix, local_metadata, strict, [], [], error_msgs)
-        # Parameters of module and children will start with prefix. We can exit early if there are none in this
-        # state_dict
-        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
-            if is_deepspeed_zero3_enabled():
-                import deepspeed
-
-                # In sharded models, each shard has only part of the full state_dict, so only gather
-                # parameters that are in the current state_dict.
-                named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-                params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
-                if len(params_to_gather) > 0:
-                    # because zero3 puts placeholders in model params, this context
-                    # manager gathers (unpartitions) the params of the current layer, then loads from
-                    # the state dict and then re-partitions them again
-                    with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
-                        if torch.distributed.get_rank() == 0:
-                            module._load_from_state_dict(*args)
-            else:
-                module._load_from_state_dict(*args)
-
-        for name, child in module._modules.items():
-            if child is not None:
-                load(child, state_dict, prefix + name + ".")
-
-    load(model_to_load, state_dict, prefix=start_prefix)
-    # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so
-    # it's safe to delete it.
-    del state_dict
-
-    return error_msgs
-
-
 class HCXVisionCAbstractor(nn.Module):
     """
     This module is based on C-Abstractor, whose license is under apache-2.0.
@@ -1781,12 +1718,7 @@ def load_sharded_checkpoint(
             for k, v in state_dict.items()
         }
 
-        if is_deepspeed_zero3_enabled():
-            # torch.distributed.barrier()
-            rank = torch.distributed.get_rank()
-            print(f"# [info] ZeRo3 - load sharded no {i}, rank {rank}")
-            load_state_dict_into_model(model, state_dict, strict=False)
-        elif is_fsdp_enabled():
+        if is_fsdp_enabled():
             if is_local_dist_rank_0():
                 model.load_state_dict(state_dict, strict=False)
             else: