4-bit Quantization of GLM-4.1V-9B

Quantization Methods
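
The script below loads GLM-4.1V-9B-Thinking with bitsandbytes NF4 4-bit quantization (double quantization, bfloat16 compute dtype) and saves the quantized weights together with the processor to a local directory for reuse.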

import torch
from transformers import AutoProcessor, BitsAndBytesConfig, Glm4vForConditionalGeneration

MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
SAVE_PATH = "./Q4"

# NF4 4-bit quantization with nested (double) quantization and bfloat16 compute
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
model = Glm4vForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    quantization_config=quant_config,
    device_map="auto",
)

# Save the quantized weights and the processor for serving
model.save_pretrained(SAVE_PATH)
processor.save_pretrained(SAVE_PATH)
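
As an optional sanity check, the saved checkpoint can be reloaded and its memory footprint inspected. This is a minimal sketch, assuming the 4-bit quantization settings were serialized into ./Q4 by save_pretrained, so no quantization config needs to be passed again:

from transformers import AutoProcessor, Glm4vForConditionalGeneration

# Reload the quantized checkpoint; the 4-bit settings stored in its config are applied automatically
model = Glm4vForConditionalGeneration.from_pretrained("./Q4", device_map="auto")
processor = AutoProcessor.from_pretrained("./Q4", use_fast=True)

# Print the approximate memory taken by the model weights
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")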

Serve the model with Transformers (OpenAI-compatible completions endpoint)
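
The FastAPI script below loads the quantized checkpoint and exposes a minimal /v1/completions endpoint modeled on the OpenAI completions API.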

# Install the additional serving dependencies
pip install fastapi uvicorn pydantic
# Save and run the following script to start the server
from transformers import AutoProcessor, BitsAndBytesConfig, Glm4vForConditionalGeneration
from fastapi import FastAPI
from pydantic import BaseModel
import torch

app = FastAPI()

# Directory holding the quantized checkpoint (SAVE_PATH from the quantization step)
MODEL_DIR = "./Q4"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_DIR,
    device_map="auto",
    quantization_config=quant_config,
)
processor = AutoProcessor.from_pretrained(MODEL_DIR, use_fast=True)

class CompletionRequest(BaseModel):
    prompt: str
    max_tokens: int = 50

@app.post("/v1/completions")
async def generate_completion(request: CompletionRequest):
    # Wrap the raw prompt in the chat format expected by the processor
    messages = [{"role": "user", "content": [{"type": "text", "text": request.prompt}]}]
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)
    # Cast any floating-point tensors (e.g. pixel values) to bfloat16 to match the compute dtype
    inputs = {k: v.to(torch.bfloat16) if v.dtype.is_floating_point else v for k, v in inputs.items()}
    generated_ids = model.generate(**inputs, max_new_tokens=request.max_tokens)
    # Decode only the newly generated tokens; special tokens are kept in the output
    output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
    return {"choices": [{"text": output_text}]}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
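
Save the script as, say, server.py (an arbitrary name used here for illustration) and start it with python server.py; uvicorn will then serve the endpoint on port 8000.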

Test the server

curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
    "prompt": "Who are you",
    "max_tokens": 50
}'
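
The same request can also be issued from Python. A minimal sketch using the requests library, assuming the server above is running on localhost:8000:

import requests

# Send a prompt to the local completions endpoint and print the generated text
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={"prompt": "Who are you", "max_tokens": 50},
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])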