How did you convert it? mlx-lm gave me 'ERROR:root:Model type glm4v not supported':
ERROR:root:Model type glm4v not supported.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/mlx_lm/utils.py", line 65, in _get_classes
    arch = importlib.import_module(f"mlx_lm.models.{model_type}")
  File "/opt/anaconda3/lib/python3.12/importlib/__init__.py", line 90, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1324, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'mlx_lm.models.glm4v'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/bin/mlx_lm.convert", line 8, in <module>
    sys.exit(main())
  File "/opt/anaconda3/lib/python3.12/site-packages/mlx_lm/convert.py", line 217, in main
    convert(**vars(args))
  File "/opt/anaconda3/lib/python3.12/site-packages/mlx_lm/convert.py", line 112, in convert
    model, config, tokenizer = fetch_from_hub(model_path, lazy=True)
  File "/opt/anaconda3/lib/python3.12/site-packages/mlx_lm/utils.py", line 272, in fetch_from_hub
    model, config = load_model(model_path, lazy)
  File "/opt/anaconda3/lib/python3.12/site-packages/mlx_lm/utils.py", line 185, in load_model
    model_class, model_args_class = get_model_classes(config=config)
  File "/opt/anaconda3/lib/python3.12/site-packages/mlx_lm/utils.py", line 69, in _get_classes
    raise ValueError(msg)
ValueError: Model type glm4v not supported.
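For context, this comes straight from the stock converter CLI (the source repo path here is a placeholder for whatever checkpoint you start from):

    python -m mlx_lm.convert --hf-path <source-GLM-4.1V-repo> -q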
You have to manually add a model file (named glm4v.py) for mlx-lm under the site-packages/mlx_lm/models directory. However, I have found that the converted model files are missing three key parameters needed for loading. Manually adding those three parameters lets the model load to some extent, but it results in abnormal behavior in vision reasoning. You can get the model files needed to load the model (or rather, to replicate the issue 🤣) from my GitHub repo by adding the glm4v folder to the MLX-VLM package. I am still trying to figure out how to load the model properly.
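For reference, mlx-lm resolves model_type by importing mlx_lm.models.<model_type> and pulling Model and ModelArgs from it, which is exactly what the traceback above shows failing. So at a minimum the new file has to expose something like the skeleton below; the config fields are illustrative placeholders, not the real GLM-4V schema:

    # Hypothetical skeleton for site-packages/mlx_lm/models/glm4v.py.
    # mlx-lm imports this module by model_type and looks up Model / ModelArgs;
    # all config fields below are placeholders, not the real GLM-4V schema.
    from dataclasses import dataclass

    import mlx.core as mx
    import mlx.nn as nn

    from .base import BaseModelArgs


    @dataclass
    class ModelArgs(BaseModelArgs):
        model_type: str
        hidden_size: int
        num_hidden_layers: int
        vocab_size: int


    class Model(nn.Module):
        def __init__(self, args: ModelArgs):
            super().__init__()
            self.args = args
            self.model_type = args.model_type
            self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
            # ... the real decoder stack goes here ...

        def __call__(self, inputs: mx.array, cache=None) -> mx.array:
            # Placeholder forward pass; a real port must implement the full
            # GLM-4V decoder. This skeleton only satisfies the import.
            return self.embed_tokens(inputs)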
I just noticed that Hugging Face automatically created a model card for me which contained problematic info. I have made corresponding amendments to it.
Great! Thank you so much ~
Hello,
First, thank you for the great work. I've tried following the instructions in your README, but the code to paste into vision.py, language.py, and glm4v.py appears to be the same for each file. Could you please update it?
Thanks! 🙏
BTW, which project were you referring to when you wrote this code?
It seems that we should use "nn.Conv3d" instead of "nn.Conv2d" in PatchEmbed, following transformers.models.glm4v.modular_glm4v.Glm4vVisionPatchEmbed and mlx_vlm.models.qwen2_5_vl.vision.PatchEmbed.
I just copied from mlx_vlm.models.qwen2_5_vl.vision.PatchEmbed, but I don't know if the logic is correct:
class PatchEmbed(nn.Module):
    def __init__(self, config: VisionConfig):
        super().__init__()
        self.patch_size = config.patch_size
        self.temporal_patch_size = config.temporal_patch_size
        self.in_channels = config.num_channels
        self.hidden_size = config.hidden_size
        # Non-overlapping 3-D patches: stride equals kernel size.
        kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size]
        self.proj = nn.Conv3d(
            config.num_channels,
            config.hidden_size,
            kernel_size=kernel_size,
            stride=kernel_size,
            bias=True,
        )

    def __call__(self, hidden_states: mx.array) -> mx.array:
        # Input rows are flattened patches; unflatten to (N, C, T, P, P),
        # then move channels last for MLX's channels-last Conv3d.
        hidden_states = hidden_states.reshape(
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.patch_size,
            self.patch_size,
        ).moveaxis(1, 4)
        hidden_states = self.proj(hidden_states)
        # Each patch collapses to a single position: flatten to (N, hidden_size).
        hidden_states = hidden_states.reshape(-1, self.hidden_size)
        return hidden_states
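As a quick smoke test of the shapes (the config values are made up for illustration, and this assumes VisionConfig accepts these keyword fields), one patch row in should give one hidden vector out:

    import mlx.core as mx

    # Hypothetical config values, just to exercise the shapes.
    cfg = VisionConfig(
        patch_size=14, temporal_patch_size=2, num_channels=3, hidden_size=1536
    )
    embed = PatchEmbed(cfg)
    # One row per patch, each row holding C * T * P * P pixel values.
    patches = mx.zeros((64, 3 * 2 * 14 * 14))
    print(embed(patches).shape)  # expected: (64, 1536)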
Use dimen == 5 to detect a Conv3d weight and dimen == 4 to detect a Conv2d weight:
def check_array_shape(arr):
    # Heuristic: True if `arr` already looks like an MLX conv weight
    # (channels last), False if it looks like a PyTorch one and needs
    # transposing.
    shape = arr.shape
    dimen = len(shape)
    if dimen == 5:
        out_channels, kD, kH, kW, in_channels = shape
    elif dimen == 4:
        out_channels, kH, kW, in_channels = shape
    else:
        return False
    if in_channels == 3:
        # RGB in the last axis: already channels-last (MLX layout).
        return True
    # Otherwise check that out_channels is the largest and kH == kW.
    return (out_channels >= kH) and (out_channels >= kW) and (kH == kW)
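For example, with a hypothetical 1536-out, 3-in, 2x14x14 patch kernel, a PyTorch-layout weight fails the check (and so gets transposed by the sanitize below) while an MLX-layout weight passes:

    import mlx.core as mx

    # PyTorch Conv3d layout: (out_channels, in_channels, kD, kH, kW)
    torch_style = mx.zeros((1536, 3, 2, 14, 14))
    # MLX Conv3d layout: (out_channels, kD, kH, kW, in_channels)
    mlx_style = mx.zeros((1536, 2, 14, 14, 3))

    print(check_array_shape(torch_style))  # False -> needs transposing
    print(check_array_shape(mlx_style))    # True  -> already channels-last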
Then add a sanitize method to glm4v/vision.py:
def sanitize(self, weights):
    sanitized_weights = {}
    for k, v in weights.items():
        if "position_ids" in k:
            # Remove unused position_ids
            continue
        elif "patch_embed.proj.weight" in k:
            # PyTorch conv3d weight tensors have shape
            # [out_channels, in_channels, kD, kH, kW];
            # MLX conv3d expects
            # [out_channels, kD, kH, kW, in_channels].
            if check_array_shape(v):
                sanitized_weights[k] = v
            else:
                sanitized_weights[k] = v.transpose(0, 2, 3, 4, 1)
        elif "downsample.weight" in k:
            # PyTorch conv2d weight tensors have shape
            # [out_channels, in_channels, kH, kW];
            # MLX conv2d expects
            # [out_channels, kH, kW, in_channels].
            if check_array_shape(v):
                sanitized_weights[k] = v
            else:
                sanitized_weights[k] = v.transpose(0, 2, 3, 1)
        else:
            sanitized_weights[k] = v
    return sanitized_weights
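If the loader picks up sanitize automatically (as it does for the other mlx-vlm models), nothing else is needed. To exercise it by hand, a minimal sketch, where the filename is a placeholder and `model` is the instantiated vision tower:

    import mlx.core as mx

    # Sketch: fix conv weight layouts before binding. Assumes the
    # (placeholder) file holds only the vision-tower tensors.
    weights = mx.load("vision_weights.safetensors")
    model.load_weights(list(model.sanitize(weights).items()))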
OK, I got the code from your GitHub and I can make the model work with mlx-vlm without an image. But when I pass an image I get this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 18
13 formatted_prompt = apply_chat_template(
14 processor, config, prompt, num_images=len(image)
15 )
17 # Generate output
---> 18 output = generate(model, processor, formatted_prompt, image, verbose=False)
19 print(output)
File ~.../site-packages/mlx_vlm/generate.py:534, in generate(model, processor, prompt, image, audio, verbose, **kwargs)
531 else:
532 tokenizer.stopping_criteria.reset(model.config.eos_token_id)
--> 534 for response in stream_generate(model, processor, prompt, image, audio, **kwargs):
535 if verbose:
536 print(response.text, end="", flush=True)
File ~.../site-packages/mlx_vlm/generate.py:424, in stream_generate(model, processor, prompt, image, audio, **kwargs)
422 detokenizer.reset()
423 tic = time.perf_counter()
--> 424 for n, (token, logprobs) in enumerate(
425 generate_step(input_ids, model, pixel_values, mask, **kwargs)
426 ):
427 if n == 0:
428 prompt_time = time.perf_counter() - tic
File ~.../site-packages/mlx_vlm/generate.py:312, in generate_step(input_ids, model, pixel_values, mask, max_tokens, temperature, repetition_penalty, repetition_context_size, top_p, logit_bias, prompt_cache, max_kv_size, kv_bits, kv_group_size, quantized_kv_start, **kwargs)
309 quantize_cache_fn(prompt_cache)
310 return y, logprobs.squeeze(0)
--> 312 outputs = model(input_ids, pixel_values, cache=prompt_cache, mask=mask, **kwargs)
314 logits = outputs.logits[:, -1, :]
315 quantize_cache_fn(prompt_cache)
File ~..../site-packages/mlx_vlm/models/glm4v/glm4v.py:148, in Model.__call__(self, input_ids, pixel_values, mask, cache, inputs_embeds, **kwargs)
138 def __call__(
139 self,
140 input_ids: mx.array,
(...)
145 **kwargs, # <-- THIS IS THE FINAL FIX
146 ):
147 if inputs_embeds is None:
--> 148 inputs_embeds, _ = self.get_input_embeddings(
149 input_ids, pixel_values, mask
150 )
152 return self.language_model(
153 inputs=input_ids,
154 cache=cache,
155 inputs_embeds=inputs_embeds,
156 )
File ~.../site-packages/mlx_vlm/models/glm4v/glm4v.py:93, in Model.get_input_embeddings(self, input_ids, pixel_values, mask)
89 # This case is for the first multi-modal input
90 inputs_embeds = self.language_model.model.embed_tokens(input_ids)
92 vision_outputs = self.vision_tower(
---> 93 pixel_values.transpose(0, 2, 3, 1).astype(inputs_embeds.dtype)
94 )
96 vision_outputs = mx.einsum(
97 "btm,md->btd",
98 self.multi_modal_projector.mm_soft_emb_norm(vision_outputs),
99 self.multi_modal_projector.mm_input_projection_weight,
100 )
102 final_inputs_embeds = self.prepare_inputs_for_multimodal(
103 vision_outputs, inputs_embeds, input_ids
104 )
ValueError: [transpose] Recived 4 axes for array with 2 dimensions.
with this code:
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

model_path = "Rainnighttram/GLM-4.1V-9B-MLX-4bit"
model, processor = load(model_path)
config = load_config(model_path)

image = ['image.jpg']
prompt = 'Describe this image.'

# Apply chat template
formatted_prompt = apply_chat_template(
    processor, config, prompt, num_images=len(image)
)

# Generate output
output = generate(model, processor, formatted_prompt, image, verbose=False)
print(output)
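Looking at the traceback, the "array with 2 dimensions" part seems to be the clue: the GLM-4.1V processor returns pixel_values already flattened to (num_patches, C * T * P * P) rows, which is exactly the layout the PatchEmbed above consumes, so the pixel_values.transpose(0, 2, 3, 1) in get_input_embeddings (which expects a whole (B, C, H, W) image tensor) has nothing to transpose. A minimal sketch of the shape fix, assuming the vision tower takes the flattened patches plus a grid argument like the Qwen2.5-VL port does (image_grid_thw is an assumption here, not confirmed for this repo):

    # Sketch only, not the repo's actual code: replaces the image-grid-style
    # transpose in Model.get_input_embeddings. `image_grid_thw` follows the
    # Qwen2.5-VL port and is an assumption here.
    def get_input_embeddings(self, input_ids, pixel_values=None, mask=None, image_grid_thw=None):
        inputs_embeds = self.language_model.model.embed_tokens(input_ids)
        if pixel_values is None:
            return inputs_embeds, None
        # pixel_values arrives already flattened to (num_patches, C * T * P * P),
        # so hand it straight to the vision tower -- no transpose(0, 2, 3, 1).
        vision_outputs = self.vision_tower(
            pixel_values.astype(inputs_embeds.dtype), image_grid_thw
        )
        final_inputs_embeds = self.prepare_inputs_for_multimodal(
            vision_outputs, inputs_embeds, input_ids
        )
        return final_inputs_embeds, None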