nm-research committed
Commit a6adb5f · verified · 1 Parent(s): 4962ff6

Update README.md

Files changed (1)
  1. README.md +24 -22
README.md CHANGED
@@ -34,32 +34,34 @@ This model was obtained by quantizing the weights of [google/gemma-3-4b-it](http
 This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
 
 ```python
-from vllm.assets.image import ImageAsset
 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
+from transformers import AutoProcessor
 
-# prepare model
-llm = LLM(
-    model="nm-testing/gemma-3-4b-it-quantized.w4a16",
-    trust_remote_code=True,
-    max_model_len=4096,
-    max_num_seqs=2,
-)
+# Define model name once
+model_name = "RedHatAI/gemma-3-4b-it-quantized.w8a8"
 
-# prepare inputs
-question = "What is the content of this image?"
-inputs = {
-    "prompt": f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n",
-    "multi_modal_data": {
-        "image": ImageAsset("cherry_blossom").pil_image.convert("RGB")
-    },
-}
-
-# generate response
-print("========== SAMPLE GENERATION ==============")
+# Load image and processor
+image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+# Build multimodal prompt
+chat = [
+    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What is the content of this image?"}]},
+    {"role": "assistant", "content": []}
+]
+prompt = processor.apply_chat_template(chat, add_generation_prompt=True)
+
+# Initialize model
+llm = LLM(model=model_name, trust_remote_code=True)
+
+# Run inference
+inputs = {"prompt": prompt, "multi_modal_data": {"image": [image]}}
 outputs = llm.generate(inputs, SamplingParams(temperature=0.2, max_tokens=64))
-print(f"PROMPT : {outputs[0].prompt}")
-print(f"RESPONSE: {outputs[0].outputs[0].text}")
-print("==========================================")
+
+# Display result
+print("RESPONSE:", outputs[0].outputs[0].text)
+
 ```
 
 vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
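For context on that last line of the hunk, below is a minimal sketch of querying a vLLM OpenAI-compatible server for this model. It assumes the server was launched separately (for example with `vllm serve RedHatAI/gemma-3-4b-it-quantized.w8a8`) and that the `openai` Python package is installed; the host, port, and API key are illustrative defaults rather than values from this README.

```python
# Sketch: query a vLLM OpenAI-compatible endpoint (assumes a running server,
# e.g. started with: vllm serve RedHatAI/gemma-3-4b-it-quantized.w8a8).
# base_url, port, and api_key below are illustrative defaults, not from the README.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # default vLLM server address (assumption)
    api_key="EMPTY",                      # vLLM does not require a real key by default
)

response = client.chat.completions.create(
    model="RedHatAI/gemma-3-4b-it-quantized.w8a8",
    messages=[{"role": "user", "content": "Describe a cherry blossom in one sentence."}],
    temperature=0.2,
    max_tokens=64,
)
print(response.choices[0].message.content)
```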