Update README.md
README.md
@@ -75,25 +75,27 @@ This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
```python
from vllm import LLM, SamplingParams
from transformers import AutoProcessor

model_id = "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
number_gpus = 1

sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)

# The processor supplies the model's chat template.
processor = AutoProcessor.from_pretrained(model_id)

messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]

# tokenize=False returns the formatted prompt as a string; vLLM tokenizes it itself.
prompts = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

llm = LLM(model=model_id, tensor_parallel_size=number_gpus)

outputs = llm.generate(prompts, sampling_params)

generated_text = outputs[0].outputs[0].text
print(generated_text)
```
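
Recent vLLM releases also expose `LLM.chat`, which applies the chat template internally, so the `AutoProcessor` step can be skipped. A minimal sketch of that variant (assuming a vLLM version that provides `LLM.chat`):

```python
# Sketch: same generation via LLM.chat, which formats chat messages
# internally (requires a vLLM release that includes LLM.chat).
from vllm import LLM, SamplingParams

model_id = "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
llm = LLM(model=model_id, tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)

messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
outputs = llm.chat(messages, sampling_params)
print(outputs[0].outputs[0].text)
```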

vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
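
As a concrete sketch of that serving path, the snippet below queries a running server with the official `openai` client. It assumes the server was started separately (for example with `vllm serve RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic`) and is listening on the default `http://localhost:8000`; the `api_key` value is a placeholder, since vLLM only checks it if one is configured.

```python
# Sketch: query a vLLM OpenAI-compatible server (assumed already running
# on localhost:8000, e.g. via `vllm serve <model_id>`).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder key

response = client.chat.completions.create(
    model="RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic",
    messages=[{"role": "user", "content": "Give me a short introduction to large language models."}],
    temperature=0.7,
    top_p=0.8,
    max_tokens=256,
)
print(response.choices[0].message.content)
```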

## Creation