4-bit Quantization of GLM-4.1V-9B
Quantization Method
The script below loads THUDM/GLM-4.1V-9B-Thinking with bitsandbytes NF4 4-bit quantization and saves the quantized weights, together with the processor, to a local directory.
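Quantizing with bitsandbytes additionally requires the bitsandbytes and accelerate packages alongside transformers and torch (this package list is inferred from the imports used in the script and may already be satisfied in your environment):

pip install -U transformers accelerate bitsandbytes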
from transformers import AutoProcessor, BitsAndBytesConfig, Glm4vForConditionalGeneration
import torch

MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
SAVE_PATH = "./Q4"

# NF4 4-bit quantization with double quantization and bfloat16 compute
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    quantization_config=quant_config,
    device_map="auto",
)

# Save the quantized weights and the processor together
model.save_pretrained(SAVE_PATH)
processor.save_pretrained(SAVE_PATH)
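As a quick sanity check, you can reload the saved checkpoint and inspect its memory footprint (a minimal sketch; get_memory_footprint() is a standard transformers model method, and the exact size will vary with your setup):

from transformers import Glm4vForConditionalGeneration

# The saved checkpoint carries its 4-bit quantization config in config.json,
# so no explicit quantization_config is needed here
reloaded = Glm4vForConditionalGeneration.from_pretrained("./Q4", device_map="auto")
print(f"Memory footprint: {reloaded.get_memory_footprint() / 1024**3:.2f} GiB")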
Serve the model with Transformers (OpenAI-compatible completions endpoint)
# Install the server dependencies
pip install fastapi uvicorn pydantic

Run the following script to start the server:
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoProcessor, BitsAndBytesConfig, Glm4vForConditionalGeneration
import torch

MODEL_PATH = "./Q4"  # directory produced by the quantization script above

app = FastAPI()

# Mirror the NF4 settings used when the checkpoint was saved
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    quantization_config=quant_config,
)
processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)

class CompletionRequest(BaseModel):
    prompt: str
    max_tokens: int = 50

@app.post("/v1/completions")
async def generate_completion(request: CompletionRequest):
    # Wrap the raw prompt in the chat format the model expects
    messages = [{"role": "user", "content": [{"type": "text", "text": request.prompt}]}]
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    # Cast any floating-point tensors to bfloat16; token ids stay integer
    inputs = {k: v.to(torch.bfloat16) if v.dtype.is_floating_point else v for k, v in inputs.items()}
    generated_ids = model.generate(**inputs, max_new_tokens=request.max_tokens)
    # Decode only the newly generated tokens, not the echoed prompt
    output_text = processor.decode(
        generated_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=False,
    )
    return {"choices": [{"text": output_text}]}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
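Save the script locally (the filename server.py below is just an example) and launch it:

python server.py
# or, equivalently, via uvicorn directly:
uvicorn server:app --host 0.0.0.0 --port 8000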
Test the server
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
"prompt": "Who are you",
"max_tokens": 50
}'
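The same request can also be sent from Python with the requests library (a minimal sketch matching the endpoint defined above; install requests if it is not already present):

import requests

response = requests.post(
    "http://localhost:8000/v1/completions",
    json={"prompt": "Who are you", "max_tokens": 50},
)
print(response.json()["choices"][0]["text"])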