Tim Luka Horstmann committed
Commit · 44afc53
1 Parent(s): 46825d7

Add streaming

llm_server.py (+18 -3)
llm_server.py CHANGED

@@ -1,6 +1,6 @@
 import time
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
+from fastapi.responses import StreamingResponse, JSONResponse
 from llama_cpp import Llama
 from huggingface_hub import login, hf_hub_download
 import logging

@@ -136,8 +136,23 @@ async def keep_model_warm():
 # ─── OpenAI-compatible endpoint ─────────────────────────────────────────────
 @app.post("/v1/chat/completions")
 async def chat(req: dict):
-    if
-
+    # if the client (Qwen-Agent) asked for a stream, proxy the SSE events:
+    if req.get("stream", False):
+        async def event_generator():
+            # llama_cpp will now yield tokens/chunks
+            for chunk in llm.create_chat_completion(
+                messages=req["messages"],
+                max_tokens=req.get("max_tokens", 256),
+                temperature=req.get("temperature", 0.7),
+                top_p=req.get("top_p", 1.0),
+                stream=True,
+            ):
+                # SSE format: data: <json>\n\n
+                yield f"data: {json.dumps(chunk)}\n\n"
+        return StreamingResponse(event_generator(),
+                                 media_type="text/event-stream")
+
+    # otherwise, fall back to the usual non-streaming JSON response
     resp = llm.create_chat_completion(
         messages=req["messages"],
         max_tokens=req.get("max_tokens", 256),
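For reference, a minimal client-side sketch of the new streaming path (not part of the commit): it assumes the server listens on http://localhost:8000 (host and port are not shown in these hunks), uses the requests library, and expects each SSE event to carry an OpenAI-style chat.completion.chunk dict, which is what llama-cpp-python yields with stream=True. Note that the handler serializes chunks with json.dumps, so json is presumably imported elsewhere in llm_server.py outside the hunks shown here.

# Client sketch: consume the SSE stream emitted by /v1/chat/completions.
# Assumptions: localhost:8000 is a placeholder URL; chunks follow the
# OpenAI chat.completion.chunk shape (choices[0]["delta"]["content"]).
import json
import requests

payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64,
    "stream": True,  # triggers the StreamingResponse branch added in this commit
}

with requests.post("http://localhost:8000/v1/chat/completions",
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # SSE events arrive as "data: <json>" lines separated by blank lines
        if not line or not line.startswith("data: "):
            continue
        chunk = json.loads(line[len("data: "):])
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)

Because the endpoint never emits a terminating "data: [DONE]" event, clients that wait for that sentinel may only detect end-of-stream when the server closes the connection; the sketch above simply reads until the response ends.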