Update README.md
README.md
CHANGED
@@ -8,7 +8,7 @@ tags:
 - sparse
 ---
 
-## llama2.c-stories110M-
+## llama2.c-stories110M-pruned2.4
 This repo contains model files for [llama2.c 110M tinystories](https://huggingface.co/Xenova/llama2.c-stories110M) optimized for [NM-vLLM](https://github.com/neuralmagic/nm-vllm), a high-throughput serving engine for compressed LLMs.
 
 This model was pruned with [SparseGPT](https://arxiv.org/abs/2301.00774), using [SparseML](https://github.com/neuralmagic/sparseml).
@@ -24,8 +24,6 @@ from vllm import LLM, SamplingParams
 
 model = LLM("nm-testing/llama2.c-stories110M-pruned2.4", sparsity="sparse_w16a16")
 prompt = "My name is "
-formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
 sampling_params = SamplingParams(max_tokens=100,temperature=0)
 outputs = model.generate(prompt, sampling_params=sampling_params)
 print(outputs[0].outputs[0].text)
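For reference, here is the post-change snippet assembled into a single runnable script. Everything is taken from the diff itself; the only assumption is that the nm-vllm fork is installed, since the `sparsity` keyword used here is an nm-vllm extension rather than part of stock vLLM.

```python
from vllm import LLM, SamplingParams

# Load the pruned checkpoint; sparsity="sparse_w16a16" is the nm-vllm
# argument the README uses to enable the sparse weight kernels.
model = LLM("nm-testing/llama2.c-stories110M-pruned2.4", sparsity="sparse_w16a16")

prompt = "My name is "

# Greedy decoding (temperature=0), capped at 100 new tokens.
sampling_params = SamplingParams(max_tokens=100, temperature=0)
outputs = model.generate(prompt, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```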
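A note on the corrected heading: a `pruned2.4` suffix conventionally denotes 2:4 semi-structured sparsity, in which at most two of every four consecutive weights are nonzero, a pattern SparseGPT can target and that sparse inference kernels accelerate. Below is a minimal sketch of checking that pattern on a weight tensor; the function is illustrative and not part of this repo or of SparseML.

```python
import torch

def satisfies_2_4(weight: torch.Tensor) -> bool:
    """Check 2:4 sparsity: at most 2 nonzero entries in every group of
    4 consecutive weights along the flattened tensor.

    Assumes weight.numel() is divisible by 4, as is typical for
    transformer linear layers.
    """
    groups = weight.reshape(-1, 4)
    nonzero_per_group = (groups != 0).sum(dim=1)
    return bool((nonzero_per_group <= 2).all())
```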