# llm_server/app.py
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# GGUF build of Llama 3.2 3B Instruct, quantized to Q4_K_M
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# Download cache; defaults to a writable path under /tmp
CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Model handle, loaded lazily on the first request
_model = None

def get_model():
    """Download the GGUF file (if needed) and load it into llama.cpp once."""
    global _model
    if _model is not None:
        return _model
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
        local_dir_use_symlinks=False,
    )
    _model = Llama(
        model_path=local_path,
        n_ctx=2048,                      # context window in tokens
        n_threads=os.cpu_count() or 2,   # use the available CPU cores
        n_batch=256,
        verbose=False,
    )
    return _model
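
# Optional: warm the model at startup instead of on the first request.
# A minimal sketch, assuming the slower startup is acceptable for this Space
# (the handler below works unchanged either way):
#
#   @app.on_event("startup")
#   def warm_up():
#       get_model()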

class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # llama-cpp-python returns an OpenAI-style completion dict
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=["</s>"],
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
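
# Example usage, assuming the app is served with uvicorn on port 7860 (the
# port Hugging Face Spaces expects); adjust host/port for local runs:
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Explain GGUF in one sentence.", "max_tokens": 64}'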