Spaces:
Configuration error
Configuration error
Reorganized inference endpoints
Browse files
inference_endpoints/{deepseekR1-qwen-32B.py → modal/deepseekR1-qwen-32B.py}
RENAMED
File without changes
|
inference_endpoints/{llama3-1-8B-instruct.py → modal/llama3-1-8B-instruct.py}
RENAMED
File without changes
|
inference_endpoints/{qwen2-5-coder-14B-instruct.py → modal/qwen2-5-coder-14B-instruct.py}
RENAMED
@@ -63,12 +63,11 @@ def serve():
|
|
63 |
"--uvicorn-log-level=info",
|
64 |
MODEL_NAME,
|
65 |
"--served-model-name", MODEL_NAME,
|
66 |
-
"--
|
67 |
-
"--max-model-len", "16000",
|
68 |
"--host", "0.0.0.0",
|
69 |
"--port", str(VLLM_PORT),
|
70 |
"--api-key",os.environ["MODAL_TOKEN_SECRET"],
|
71 |
-
"--enforce-eager"
|
72 |
]
|
73 |
|
74 |
subprocess.Popen(" ".join(cmd), shell=True)
|
|
|
63 |
"--uvicorn-log-level=info",
|
64 |
MODEL_NAME,
|
65 |
"--served-model-name", MODEL_NAME,
|
66 |
+
"--max-model-len", "32000",
|
|
|
67 |
"--host", "0.0.0.0",
|
68 |
"--port", str(VLLM_PORT),
|
69 |
"--api-key",os.environ["MODAL_TOKEN_SECRET"],
|
70 |
+
"--enforce-eager" # Don't compile CUDA graph, saves memory and cold start time
|
71 |
]
|
72 |
|
73 |
subprocess.Popen(" ".join(cmd), shell=True)
|