gperdrizet committed on
Commit
a5548e7
·
verified ·
1 Parent(s): 2db495f

Reorganized inference endpoints

Browse files
inference_endpoints/{deepseekR1-qwen-32B.py → modal/deepseekR1-qwen-32B.py} RENAMED
File without changes
inference_endpoints/{llama3-1-8B-instruct.py → modal/llama3-1-8B-instruct.py} RENAMED
File without changes
inference_endpoints/{qwen2-5-coder-14B-instruct.py → modal/qwen2-5-coder-14B-instruct.py} RENAMED
@@ -63,12 +63,11 @@ def serve():
63
  "--uvicorn-log-level=info",
64
  MODEL_NAME,
65
  "--served-model-name", MODEL_NAME,
66
- "--tensor-parallel-size", "2",
67
- "--max-model-len", "16000",
68
  "--host", "0.0.0.0",
69
  "--port", str(VLLM_PORT),
70
  "--api-key",os.environ["MODAL_TOKEN_SECRET"],
71
- "--enforce-eager"
72
  ]
73
 
74
  subprocess.Popen(" ".join(cmd), shell=True)
 
63
  "--uvicorn-log-level=info",
64
  MODEL_NAME,
65
  "--served-model-name", MODEL_NAME,
66
+ "--max-model-len", "32000",
 
67
  "--host", "0.0.0.0",
68
  "--port", str(VLLM_PORT),
69
  "--api-key",os.environ["MODAL_TOKEN_SECRET"],
70
+ "--enforce-eager" # Don't compile CUDA graph, saves memory and cold start time
71
  ]
72
 
73
  subprocess.Popen(" ".join(cmd), shell=True)