Update README: set max_model_len to 8192 for optimal performance
README.md
```diff
@@ -104,7 +104,7 @@ llm = LLM(
     tensor_parallel_size=2,  # Adjust based on your GPU setup
     dtype="bfloat16",
     gpu_memory_utilization=0.95,  # Use 95% of GPU memory
-    max_model_len=
+    max_model_len=8192  # Large context length for extended conversations
 )
 
 # Configure sampling
```
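For context, here is a minimal sketch of how the surrounding README snippet might read after this change. It assumes vLLM's `LLM` and `SamplingParams` APIs; the model name, sampling values, and prompt are placeholders not shown in this diff:

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-2-13b-chat-hf",  # Placeholder; the model is not shown in this diff
    tensor_parallel_size=2,        # Adjust based on your GPU setup
    dtype="bfloat16",
    gpu_memory_utilization=0.95,   # Use 95% of GPU memory
    max_model_len=8192,            # Large context length for extended conversations
)

# Configure sampling (illustrative values; the diff cuts off before this section)
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=512)

outputs = llm.generate(["Hello, how are you?"], sampling_params)
print(outputs[0].outputs[0].text)
```

Note that `max_model_len=8192` caps the combined prompt and generation length; the KV cache must fit this context within the 95% of GPU memory reserved by `gpu_memory_utilization`.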