omaryasserhassan committed
Commit c06971b · verified · 1 Parent(s): e844fc3

Update app.py

Files changed (1)
  1. app.py +16 -35
app.py CHANGED
@@ -1,71 +1,52 @@
 import os
-import traceback
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
-from ctransformers import LLM
+from llama_cpp import Llama
 
-# --- Config ---
 REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
-FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # Low quantization
-MODEL_TYPE = "llama"
+FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
 
-# --- Cache dir ---
 CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 
-# --- FastAPI App ---
-app = FastAPI(title="Llama 3.2 3B Instruct API")
+app = FastAPI()
 _model = None
 
-# --- Load Model ---
 def get_model():
     global _model
     if _model is not None:
         return _model
-
-    local_file = hf_hub_download(
+    local_path = hf_hub_download(
         repo_id=REPO_ID,
         filename=FILENAME,
         cache_dir=CACHE_DIR,
         local_dir_use_symlinks=False,
     )
-
-    _model = LLM(
-        model=local_file,  # direct file path
-        model_type=MODEL_TYPE,
-        gpu_layers=0,
-        threads=os.cpu_count() or 2
+    _model = Llama(
+        model_path=local_path,
+        n_ctx=2048,
+        n_threads=os.cpu_count() or 2,
+        n_batch=256,
+        verbose=False
     )
     return _model
 
-# --- Request Schema ---
 class PromptRequest(BaseModel):
     prompt: str
-    max_new_tokens: int = 256
+    max_tokens: int = 256
     temperature: float = 0.7
 
-# --- API Endpoint ---
 @app.post("/generate")
 def generate_text(req: PromptRequest):
     try:
         model = get_model()
         output = model(
             req.prompt,
-            max_new_tokens=req.max_new_tokens,
-            temperature=req.temperature
+            max_tokens=req.max_tokens,
+            temperature=req.temperature,
+            stop=["</s>"]
         )
-        return {"ok": True, "response": output}
-    except Exception as e:
-        tb = traceback.format_exc()
-        print("❌ ERROR in /generate:\n", tb)
-        raise HTTPException(status_code=500, detail={"error": str(e), "trace": tb})
-
-# --- Health Check ---
-@app.get("/health")
-def health():
-    try:
-        _ = get_model()
-        return {"ok": True}
+        return {"ok": True, "response": output["choices"][0]["text"]}
     except Exception as e:
-        return {"ok": False, "error": str(e)}
+        raise HTTPException(status_code=500, detail=str(e))
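
For reference, the updated app.py exposes a single POST /generate route whose JSON body follows PromptRequest (prompt, max_tokens, temperature) and which returns {"ok": true, "response": "<generated text>"} on success. Below is a minimal client-side sketch, assuming the requests library is installed; BASE_URL and its port are placeholders, since the deployed Space URL is not part of this commit.

import requests

# Placeholder; replace with the deployed Space URL. Port 7860 is only an assumption.
BASE_URL = "http://localhost:7860"

payload = {
    "prompt": "Explain what a GGUF file is in one sentence.",
    "max_tokens": 128,     # PromptRequest.max_tokens (default 256)
    "temperature": 0.7,    # PromptRequest.temperature (default 0.7)
}

resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
resp.raise_for_status()

# Success payload is {"ok": True, "response": "<generated text>"};
# errors surface as HTTP 500 with the exception message in "detail".
print(resp.json()["response"])

Note that the /health route from the previous revision is dropped in this commit, so /generate is the only endpoint the updated app serves.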