cache_dir on SSD is super slow, and once loaded on an NVIDIA Quadro P4000 with 4-bit the model returns nothing (the image is a composite of multiple images with timestamps)


Hello, I just want to test google/gemma-3-4b-it in 4-bit on my local PC with an NVIDIA Quadro P4000 (8 GB), which has enough VRAM to load the model and leave room for images. The Gemini 2.5 API is great but too expensive for now. I got the model loading with cache_dir, even if it is super slow, but I get no text generated. I have been working with the Gemini console to try to debug and find a solution in code, but I am still stuck. I am not getting any errors; the model just returns no text. Any ideas?
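
Roughly, the model and processor are loaded like this (a sketch, not my exact code, assuming 4-bit quantization through bitsandbytes' BitsAndBytesConfig; the cache_dir path is a placeholder for the SSD cache):

    import torch
    from transformers import AutoProcessor, BitsAndBytesConfig, Gemma3ForConditionalGeneration

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,  # the P4000 (Pascal) has no native bfloat16 support
    )

    model = Gemma3ForConditionalGeneration.from_pretrained(
        "google/gemma-3-4b-it",
        quantization_config=bnb_config,
        device_map="auto",
        cache_dir="/mnt/ssd/hf_cache",  # placeholder path for the SSD cache
    )
    processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it", cache_dir="/mnt/ssd/hf_cache")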
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "tell me what you see"}]  # use the real prompt once this works
        }
    ]

    for img in pil_images:
        messages[1]["content"].append({"type": "image", "image": self._resize_image_to_896x896(img)})
        break  # debug: just one image for now

    print(f"\nDEBUG: Messages structure for processor: {messages}")

    try:
        print("\nDEBUG: Attempting to call self.processor with structured messages and images...")
        processed_inputs = self.processor.apply_chat_template(
            messages, 
            add_generation_prompt=True, 
            tokenize=True,
            return_dict=True, 
            return_tensors="pt"
        ).to(self.model.device, dtype=torch.bfloat16)
        print("DEBUG: self.processor call successful.")
        
        generate_start = time.time()
        print("\Generating result...")
        # Get the length of the input to extract only the generated tokens later
        input_len = processed_inputs["input_ids"].shape[-1]
        max_new_tokens = 32000
        # Generate text
        with torch.inference_mode():
            print(f"pixel_values: {processed_inputs['pixel_values'].shape}")
            generation = self.model.generate(
                **processed_inputs, 
                min_new_tokens=20, # <-- Add this! Force at least 20 tokens
                max_new_tokens=max_new_tokens, 
                do_sample=False,
                # You might also want to add `temperature=temperature` if you want to use the passed argument
                num_beams=1, # Explicitly set for greedy decoding if not using do_sample=True
                # pad_token_id=self.processor.tokenizer.eos_token_id, # Good practice for text generation
                # eos_token_id=self.processor.tokenizer.eos_token_id, # Good practice for text generation
                eos_token_id=None, # <-- SET EOS_TOKEN_ID TO NONE FOR DEBUGGING
            )

        print(f"\nDEBUG: Input length (input_len): {input_len}")
        print(f"DEBUG: Raw generation tensor shape: {generation.shape}")

        # Decode the *entire* generated tensor (including input) to see everything
        raw_decoded_output = self.processor.decode(generation[0], skip_special_tokens=False)
        print(f"DEBUG: Raw decoded output (with special tokens): \n{raw_decoded_output}")

        # Extract only the generated tokens
        generation = generation[0][input_len:]

        print(f"DEBUG: Sliced generation tensor shape: {generation.shape}")

        # Now decode the sliced part
        generated_text = self.processor.decode(generation, skip_special_tokens=True)
        print(f"DEBUG: Final generated_text before post-processing: \n'{generated_text}'")

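For completeness, the _resize_image_to_896x896 helper method is roughly equivalent to the following sketch (not the exact code); 896x896 is the input resolution Gemma 3's vision encoder expects:

    from PIL import Image

    def _resize_image_to_896x896(self, img: Image.Image) -> Image.Image:
        # Plain resize to the 896x896 resolution expected by Gemma 3's image encoder.
        return img.convert("RGB").resize((896, 896), Image.Resampling.LANCZOS)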