Tim Luka Horstmann committed
Commit 44afc53 · 1 Parent(s): 46825d7

Add streaming

Files changed (1)
llm_server.py  +18 -3
llm_server.py CHANGED

@@ -1,6 +1,6 @@
 import time
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
+from fastapi.responses import StreamingResponse, JSONResponse
 from llama_cpp import Llama
 from huggingface_hub import login, hf_hub_download
 import logging
@@ -136,8 +136,23 @@ async def keep_model_warm():
 # ─── OpenAI‐compatible endpoint ─────────────────────────────────────────────
 @app.post("/v1/chat/completions")
 async def chat(req: dict):
-    if req.get("model") != "llama-cpp":
-        raise HTTPException(404, "Model not found")
+    # if the client (Qwen-Agent) asked for a stream, proxy the SSE events:
+    if req.get("stream", False):
+        async def event_generator():
+            # llama_cpp will now yield tokens/chunks
+            for chunk in llm.create_chat_completion(
+                messages=req["messages"],
+                max_tokens=req.get("max_tokens", 256),
+                temperature=req.get("temperature", 0.7),
+                top_p=req.get("top_p", 1.0),
+                stream=True,
+            ):
+                # SSE format: data: <json>\n\n
+                yield f"data: {json.dumps(chunk)}\n\n"
+        return StreamingResponse(event_generator(),
+                                 media_type="text/event-stream")
+
+    # otherwise, fall back to the usual non-streaming JSON response
     resp = llm.create_chat_completion(
         messages=req["messages"],
         max_tokens=req.get("max_tokens", 256),
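
For reference, a minimal client-side sketch of how the new streaming branch could be exercised. This is not part of the commit: the base URL/port and the stream_chat helper are hypothetical, and the chunk layout (choices[0].delta.content) assumes llama-cpp-python's OpenAI-compatible streaming chunks.

    # hypothetical client sketch -- consumes the SSE stream added in this commit
    import json
    import requests

    def stream_chat(messages, url="http://localhost:8000/v1/chat/completions"):
        payload = {
            "messages": messages,
            "max_tokens": 256,
            "stream": True,   # triggers the StreamingResponse branch above
        }
        with requests.post(url, json=payload, stream=True) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines(decode_unicode=True):
                # each SSE event arrives as "data: <json>"; blank lines separate events
                if not line or not line.startswith("data: "):
                    continue
                chunk = json.loads(line[len("data: "):])
                # assumed chunk shape: OpenAI-style {"choices": [{"delta": {"content": ...}}]}
                piece = chunk["choices"][0].get("delta", {}).get("content")
                if piece:
                    print(piece, end="", flush=True)

    if __name__ == "__main__":
        stream_chat([{"role": "user", "content": "Say hello in one sentence."}])

Requests without "stream": true still fall through to the non-streaming JSON path at the bottom of the hunk. The streamed branch also relies on json.dumps, so the sketch assumes json is imported somewhere in llm_server.py outside the lines shown in this diff.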