cache_dir on SSD is super slow, and once loaded on an NVIDIA Quadro P4000 in 4-bit the model returns nothing (the input image is a composite of multiple images with timestamps)
Hello, I just want to test google/gemma-3-4b-it with 4-bit quantization on my local PC with an NVIDIA Quadro P4000 (8 GB); that is enough memory to load the model and leave room for images. The Gemini 2.5 API works great but is too expensive for now. I got the model loading with cache_dir, even if it is super slow, but I get no text generated. I have been working with the Gemini console to try to debug and find a fix in code, but I am still stuck. I am not getting any errors; the model just returns no text. Any idea?
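For reference, the model and processor are loaded roughly like this (a minimal sketch, not my exact code; the cache_dir path and the BitsAndBytesConfig options are assumptions):

import torch
from transformers import AutoProcessor, BitsAndBytesConfig, Gemma3ForConditionalGeneration

model_id = "google/gemma-3-4b-it"
cache_dir = "D:/hf_cache"  # placeholder path pointing at the SSD

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # assumption: float16 compute dtype for the 4-bit weights
)

processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir=cache_dir,
)
# These are stored as self.processor and self.model in my class.

The part that actually builds the prompt and generates is below.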
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}]
    },
    {
        "role": "user",
        "content": [{"type": "text", "text": "tell me what you see"}]  # swap in the real prompt once this works
    }
]

for img in pil_images:
    messages[1]["content"].append({"type": "image", "image": self._resize_image_to_896x896(img)})
    break  # debug with just one image for now (the resize helper is sketched after this snippet)
print(f"\nDEBUG: Messages structure for processor: {messages}")
try:
print("\nDEBUG: Attempting to call self.processor with structured messages and images...")
processed_inputs = self.processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt"
).to(self.model.device, dtype=torch.bfloat16)
print("DEBUG: self.processor call successful.")
generate_start = time.time()
print("\Generating result...")
    # Get the length of the input to extract only the generated tokens later
    input_len = processed_inputs["input_ids"].shape[-1]
    max_new_tokens = 32000

    # Generate text
    with torch.inference_mode():
        print(f"pixel_values: {processed_inputs['pixel_values'].shape}")
        generation = self.model.generate(
            **processed_inputs,
            min_new_tokens=20,  # force at least 20 tokens
            max_new_tokens=max_new_tokens,
            do_sample=False,
            # temperature could be passed here if sampling were enabled
            num_beams=1,  # explicitly set for greedy decoding since do_sample=False
            # pad_token_id=self.processor.tokenizer.eos_token_id,  # good practice for text generation
            # eos_token_id=self.processor.tokenizer.eos_token_id,  # good practice for text generation
            eos_token_id=None,  # eos_token_id set to None for debugging
        )

    print(f"\nDEBUG: Input length (input_len): {input_len}")
    print(f"DEBUG: Raw generation tensor shape: {generation.shape}")

    # Decode the *entire* generated tensor (including the input) to see everything
    raw_decoded_output = self.processor.decode(generation[0], skip_special_tokens=False)
    print(f"DEBUG: Raw decoded output (with special tokens): \n{raw_decoded_output}")

    # Extract only the generated tokens
    generation = generation[0][input_len:]
    print(f"DEBUG: Sliced generation tensor shape: {generation.shape}")

    # Now decode the sliced part
    generated_text = self.processor.decode(generation, skip_special_tokens=True)
    print(f"DEBUG: Final generated_text before post-processing: \n'{generated_text}'")