Update README.md
Browse files
README.md
CHANGED
@@ -142,7 +142,7 @@ from llama_cpp import Llama
|
|
142 |
|
143 |
# Set n_gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
|
144 |
llm = Llama(
|
145 |
-
model_path="./
|
146 |
n_ctx=32768, # The max sequence length to use - note that longer sequence lengths require much more resources
|
147 |
n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
|
148 |
n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
|
@@ -158,7 +158,7 @@ output = llm(
|
|
158 |
|
159 |
# Chat Completion API
|
160 |
|
161 |
-
llm = Llama(model_path="./
|
162 |
llm.create_chat_completion(
|
163 |
messages = [
|
164 |
{"role": "system", "content": "You are a story writing assistant."},
|
|
|
142 |
|
143 |
# Set n_gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
|
144 |
llm = Llama(
|
145 |
+
model_path="./mixtral-8x22b-v0.1-instruct-oh-Q8_0-00001-of-00004.gguf", # Download the model file first
|
146 |
n_ctx=32768, # The max sequence length to use - note that longer sequence lengths require much more resources
|
147 |
n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
|
148 |
n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
|
|
|
158 |
|
159 |
# Chat Completion API
|
160 |
|
161 |
+
llm = Llama(model_path="./mixtral-8x22b-v0.1-instruct-oh-Q8_0-00001-of-00004.gguf", chat_format="llama-2") # Set chat_format according to the model you are using
|
162 |
llm.create_chat_completion(
|
163 |
messages = [
|
164 |
{"role": "system", "content": "You are a story writing assistant."},
|