Model issue with 64GB RAM
Loading checkpoint shards: 0%| | 0/13 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/petrus/AI/Transformers/llamaGOD.py", line 12, in <module>
model = Llama4ForConditionalGeneration.from_pretrained(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/petrus/AI/Transformers/venv/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/petrus/AI/Transformers/venv/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4400, in from_pretrained
) = cls._load_pretrained_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/petrus/AI/Transformers/venv/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4793, in _load_pretrained_model
caching_allocator_warmup(model_to_load, expanded_device_map, factor=2 if hf_quantizer is None else 4)
File "/home/petrus/AI/Transformers/venv/lib/python3.11/site-packages/transformers/modeling_utils.py", line 5799, in caching_allocator_warmup
_ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: [enforce fail at alloc_cpu.cpp:118] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 108641793536 bytes. Error code 12 (Cannot allocate memory)
Loading checkpoint shards: 0%| | 0/13 [00:00<?, ?it/s]
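The failure happens in the caching-allocator warmup, not while copying weights: it attempts a single torch.empty of 108,641,793,536 bytes (roughly 101 GiB), which cannot fit in 64 GB of RAM. The commented-out lines in the script below were aimed at exactly this; a minimal sketch of that workaround, assuming the warmup is safe to skip when loading on CPU, is to replace it with a no-op before from_pretrained runs:

# Sketch: disable the caching-allocator warmup before loading.
# Assumption: skipping the warmup only loses the pre-allocation speed-up;
# the shards are still loaded one by one afterwards.
from transformers import modeling_utils

modeling_utils.caching_allocator_warmup = lambda *args, **kwargs: None  # no-op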
Using the newest checkpoints with the following script:
from transformers import AutoProcessor, Llama4ForConditionalGeneration
import torch
#from transformers import modeling_utils
# Disable the caching allocator warmup
#modeling_utils.caching_allocator_warmup = lambda *args, **kwargs: None
model_id = "/home/petrus/AI/LLaMa-4-Scout-A17B-100B/"
processor = AutoProcessor.from_pretrained(model_id)
model = Llama4ForConditionalGeneration.from_pretrained(
    model_id,
    attn_implementation="flex_attention",
    device_map="cpu",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    # max_seq_len=4096,
)
url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": url1},
            {"type": "image", "url": url2},
            {"type": "text", "text": "Can you describe how these two images are similar, and how they differ?"},
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
)
response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
print(response)
print(outputs[0])
Edit:
Also changed the following in config.json:
"max_position_embeddings": 4096,
Some shards in the safetensors index have tensors stored as torch.uint8, which is incompatible with bnb.nn.Params4bit, so the model fails to load with the bitsandbytes quantizer. Not worth downloading until Unsloth confirms they have fixed and re-uploaded the checkpoints.
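Once the shards are on disk, the per-tensor dtypes can be checked without loading any weights, since they are recorded in each shard's JSON header; a small sketch, assuming the shards sit in a local directory like the one used above (safetensors encodes uint8 as "U8"):

import glob
import json
import struct

ckpt_dir = "/home/petrus/AI/LLaMa-4-Scout-A17B-100B"

for shard in sorted(glob.glob(f"{ckpt_dir}/*.safetensors")):
    # A safetensors file starts with an 8-byte little-endian header length,
    # followed by a JSON header listing every tensor's dtype, shape and offsets.
    with open(shard, "rb") as f:
        header_len = struct.unpack("<Q", f.read(8))[0]
        header = json.loads(f.read(header_len))
    u8 = [name for name, info in header.items()
          if name != "__metadata__" and info["dtype"] == "U8"]
    if u8:
        print(f"{shard}: {len(u8)} uint8 tensors, e.g. {u8[0]}")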
Currently does not work on any framework except Unsloth. There's a huge issue with 4-bit compatibility for Llama 4. Will announce more details and official support tomorrow.
@shimmyshimmer Any update on 4-bit working with vLLM ? TIA :)
@shimmyshimmer any updates? :(((
Sorry guys, it's taking longer than expected because of the bug fixes etc. Definitely by this week.