Tim Luka Horstmann committed
Commit · 44afc53
1 Parent(s): 46825d7

Add streaming

llm_server.py (+18 -3)
llm_server.py CHANGED

@@ -1,6 +1,6 @@
 import time
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
+from fastapi.responses import StreamingResponse, JSONResponse
 from llama_cpp import Llama
 from huggingface_hub import login, hf_hub_download
 import logging

@@ -136,8 +136,23 @@ async def keep_model_warm():
 # ─── OpenAI-compatible endpoint ─────────────────────────────────────────────
 @app.post("/v1/chat/completions")
 async def chat(req: dict):
-    if
-
+    # if the client (Qwen-Agent) asked for a stream, proxy the SSE events:
+    if req.get("stream", False):
+        async def event_generator():
+            # llama_cpp will now yield tokens/chunks
+            for chunk in llm.create_chat_completion(
+                messages=req["messages"],
+                max_tokens=req.get("max_tokens", 256),
+                temperature=req.get("temperature", 0.7),
+                top_p=req.get("top_p", 1.0),
+                stream=True,
+            ):
+                # SSE format: data: <json>\n\n
+                yield f"data: {json.dumps(chunk)}\n\n"
+        return StreamingResponse(event_generator(),
+                                 media_type="text/event-stream")
+
+    # otherwise, fall back to the usual non-streaming JSON response
     resp = llm.create_chat_completion(
         messages=req["messages"],
         max_tokens=req.get("max_tokens", 256),
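For reference, a minimal client-side sketch of the new streaming path (not part of the commit): it assumes the server listens on http://localhost:8000 (host and port are not shown in these hunks), uses the requests library, and expects each SSE event to carry an OpenAI-style chat.completion.chunk dict, which is what llama-cpp-python yields with stream=True. Note that the handler serializes chunks with json.dumps, so json is presumably imported elsewhere in llm_server.py outside the hunks shown here.

# Client sketch: consume the SSE stream emitted by /v1/chat/completions.
# Assumptions: localhost:8000 is a placeholder URL; chunks follow the
# OpenAI chat.completion.chunk shape (choices[0]["delta"]["content"]).
import json
import requests

payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64,
    "stream": True,  # triggers the StreamingResponse branch added in this commit
}

with requests.post("http://localhost:8000/v1/chat/completions",
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # SSE events arrive as "data: <json>" lines separated by blank lines
        if not line or not line.startswith("data: "):
            continue
        chunk = json.loads(line[len("data: "):])
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)

Because the endpoint never emits a terminating "data: [DONE]" event, clients that wait for that sentinel may only detect end-of-stream when the server closes the connection; the sketch above simply reads until the response ends.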