Commit 5d8cca4 · Parent(s): 222587e
Use Gradio wrapper to satisfy HF Spaces port requirement

Files changed:
- Dockerfile +1 -4
- app.py +17 -50
- requirements.txt +1 -1
Dockerfile
CHANGED
@@ -1,11 +1,8 @@
 FROM python:3.11-alpine
-
-RUN apk add --no-cache build-base libffi-dev cmake git curl
+RUN apk add --no-cache build-base libffi-dev cmake git
 WORKDIR /app
-
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-
 COPY . .
 EXPOSE 7860
 CMD ["python", "-u", "app.py"]
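As a quick local sanity check of the port this image exposes, the sketch below (not part of the commit) assumes the image has already been built and started with a standard docker run -p 7860:7860, and simply probes port 7860 from the host:

# Local smoke test, assuming the container was started separately, e.g.:
#   docker build -t foundation-sec .
#   docker run --rm -p 7860:7860 foundation-sec
# It only checks that the Gradio server answers on the port HF Spaces expects.
import requests

resp = requests.get("http://localhost:7860", timeout=10)
resp.raise_for_status()
print("Server responded on port 7860 with HTTP", resp.status_code)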
app.py
CHANGED
@@ -1,6 +1,4 @@
-import os, logging, requests, uvicorn
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+import os, gradio as gr
 from llama_cpp import Llama
 
 MODEL_URL = (
@@ -9,54 +7,23 @@ MODEL_URL = (
 )
 MODEL_PATH = "foundation-sec-8b-q4_k_m.gguf"
 
-
+# download once (Gradio blocks until the file exists)
+if not os.path.exists(MODEL_PATH):
+    import requests, time
+    with requests.get(MODEL_URL, stream=True) as r:
+        r.raise_for_status()
+        with open(MODEL_PATH, "wb") as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
 
-
-app = FastAPI()
+llm = Llama(model_path=MODEL_PATH, n_ctx=4096, verbose=False)
 
-
-
-
+def chat_fn(message, history):
+    messages = [{"role": "user", "content": message}]
+    out = llm.create_chat_completion(messages=messages, max_tokens=256, temperature=0.7)
+    return out["choices"][0]["message"]["content"]
 
-
-def download_model():
-    if not os.path.exists(MODEL_PATH):
-        logging.info("Downloading model …")
-        with requests.get(MODEL_URL, stream=True, timeout=30) as r:
-            r.raise_for_status()
-            with open(MODEL_PATH, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-        logging.info("Download finished.")
-    logging.info("Loading model into RAM …")
-    app.state.llm = Llama(
-        model_path=MODEL_PATH,
-        n_ctx=4096,
-        n_threads=os.cpu_count(),
-        verbose=False
-    )
-    logging.info("Model ready.")
+demo = gr.ChatInterface(chat_fn, title="Foundation-Sec-8B")
 
-
-
-class ChatRequest(BaseModel):
-    messages: list[dict]
-    max_tokens: int = 256
-    temperature: float = 0.7
-
-@app.post("/v1/chat/completions")
-def chat(req: ChatRequest):
-    if not hasattr(app.state, "llm"):
-        raise HTTPException(
-            status_code=503,
-            detail="Model still loading, please retry in ~30 s"
-        )
-    return app.state.llm.create_chat_completion(
-        messages=req.messages,
-        max_tokens=req.max_tokens,
-        temperature=req.temperature
-    )
-
-# --- start uvicorn on port 7860 (HF expects this) ---
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+# expose on 0.0.0.0:7860 (Gradio default)
+demo.launch(server_name="0.0.0.0", server_port=7860)
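With the FastAPI route removed, the Space is now exercised through the Gradio app. A minimal client-side sketch, assuming the Space is published under a placeholder id your-user/foundation-sec-8b and that gr.ChatInterface exposes its usual /chat API route (both are assumptions, not part of this commit):

# Client-side usage sketch (placeholder Space id; requires the gradio_client package).
from gradio_client import Client

# Connect to the running Space; replace the id with the real owner/space name.
client = Client("your-user/foundation-sec-8b")

# gr.ChatInterface normally registers a "/chat" endpoint whose first positional
# argument is the user message.
reply = client.predict("Summarize CVE-2021-44228 in one sentence.", api_name="/chat")
print(reply)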
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 fastapi==0.110
-uvicorn[standard]==0.29
 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 llama-cpp-python==0.2.90
 requests # <-- add this line
+gradio==4.43.0 # or latest