omaryasserhassan committed
Commit c06971b · verified · 1 Parent(s): e844fc3

Update app.py

Files changed (1)
  1. app.py +16 -35
app.py CHANGED
@@ -1,71 +1,52 @@
 import os
-import traceback
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
-from ctransformers import LLM
+from llama_cpp import Llama
 
-# --- Config ---
 REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
-FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # Low quantization
-MODEL_TYPE = "llama"
+FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
 
-# --- Cache dir ---
 CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 
-# --- FastAPI App ---
-app = FastAPI(title="Llama 3.2 3B Instruct API")
+app = FastAPI()
 _model = None
 
-# --- Load Model ---
 def get_model():
     global _model
     if _model is not None:
         return _model
-
-    local_file = hf_hub_download(
+    local_path = hf_hub_download(
         repo_id=REPO_ID,
         filename=FILENAME,
         cache_dir=CACHE_DIR,
         local_dir_use_symlinks=False,
     )
-
-    _model = LLM(
-        model=local_file,  # direct file path
-        model_type=MODEL_TYPE,
-        gpu_layers=0,
-        threads=os.cpu_count() or 2
+    _model = Llama(
+        model_path=local_path,
+        n_ctx=2048,
+        n_threads=os.cpu_count() or 2,
+        n_batch=256,
+        verbose=False
     )
     return _model
 
-# --- Request Schema ---
 class PromptRequest(BaseModel):
     prompt: str
-    max_new_tokens: int = 256
+    max_tokens: int = 256
     temperature: float = 0.7
 
-# --- API Endpoint ---
 @app.post("/generate")
 def generate_text(req: PromptRequest):
     try:
         model = get_model()
         output = model(
             req.prompt,
-            max_new_tokens=req.max_new_tokens,
-            temperature=req.temperature
+            max_tokens=req.max_tokens,
+            temperature=req.temperature,
+            stop=["</s>"]
         )
-        return {"ok": True, "response": output}
-    except Exception as e:
-        tb = traceback.format_exc()
-        print("❌ ERROR in /generate:\n", tb)
-        raise HTTPException(status_code=500, detail={"error": str(e), "trace": tb})
-
-# --- Health Check ---
-@app.get("/health")
-def health():
-    try:
-        _ = get_model()
-        return {"ok": True}
+        return {"ok": True, "response": output["choices"][0]["text"]}
     except Exception as e:
-        return {"ok": False, "error": str(e)}
+        raise HTTPException(status_code=500, detail=str(e))
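
For reference, the updated app.py exposes a single POST /generate route whose JSON body follows PromptRequest (prompt, max_tokens, temperature) and which returns {"ok": true, "response": "<generated text>"} on success. Below is a minimal client-side sketch, assuming the requests library is installed; BASE_URL and its port are placeholders, since the deployed Space URL is not part of this commit.

import requests

# Placeholder; replace with the deployed Space URL. Port 7860 is only an assumption.
BASE_URL = "http://localhost:7860"

payload = {
    "prompt": "Explain what a GGUF file is in one sentence.",
    "max_tokens": 128,     # PromptRequest.max_tokens (default 256)
    "temperature": 0.7,    # PromptRequest.temperature (default 0.7)
}

resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
resp.raise_for_status()

# Success payload is {"ok": True, "response": "<generated text>"};
# errors surface as HTTP 500 with the exception message in "detail".
print(resp.json()["response"])

Note that the /health route from the previous revision is dropped in this commit, so /generate is the only endpoint the updated app serves.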