from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import logging
import os
from typing import Optional
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(
    title="DeepSeek R1 Chat API",
    description="DeepSeek R1 model hosted on Hugging Face Spaces",
    version="1.0.0"
)
# Request/Response models
class ChatRequest(BaseModel):
    message: str
    max_length: Optional[int] = 512
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9


class ChatResponse(BaseModel):
    response: str
    status: str
# Global variables for model and tokenizer
model = None
tokenizer = None
@app.on_event("startup")
async def load_model():
"""Load the DeepSeek model on startup"""
global model, tokenizer
try:
logger.info("Loading DeepSeek R1 model...")
# Use a smaller DeepSeek model that fits in Spaces
model_name = "deepseek-ai/deepseek-r1-distill-qwen-1.5b"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True,
padding_side="left"
)
# Add pad token if it doesn't exist
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# Load model with appropriate settings for Spaces
model = AutoModelForCausalLM.from_pretrained(
model_name,
trust_remote_code=True,
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
device_map="auto" if torch.cuda.is_available() else None,
low_cpu_mem_usage=True
)
logger.info("Model loaded successfully!")
except Exception as e:
logger.error(f"Error loading model: {str(e)}")
raise e
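# Rough memory note (an assumption about typical Space hardware, not part of the
# original file): a 1.5B-parameter model in float32 needs about 1.5B x 4 bytes
# ≈ 6 GB for the weights alone, so a CPU-only Space should hold it, but the
# first startup can take several minutes while the checkpoint downloads and loads.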
@app.get("/")
async def root():
"""Health check endpoint"""
return {
"message": "DeepSeek R1 Chat API is running!",
"status": "healthy",
"model_loaded": model is not None
}
@app.get("/health")
async def health_check():
"""Detailed health check"""
return {
"status": "healthy",
"model_loaded": model is not None,
"tokenizer_loaded": tokenizer is not None,
"cuda_available": torch.cuda.is_available(),
"device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0
}
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""Chat endpoint for DeepSeek model"""
if model is None or tokenizer is None:
raise HTTPException(status_code=503, detail="Model not loaded yet")
try:
# Prepare the input
prompt = f"User: {request.message}\nAssistant:"
# Tokenize input
inputs = tokenizer(
prompt,
return_tensors="pt",
padding=True,
truncation=True,
max_length=1024
)
# Move to appropriate device
if torch.cuda.is_available():
inputs = {k: v.cuda() for k, v in inputs.items()}
# Generate response
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=request.max_length,
temperature=request.temperature,
top_p=request.top_p,
do_sample=True,
pad_token_id=tokenizer.eos_token_id,
eos_token_id=tokenizer.eos_token_id,
repetition_penalty=1.1
)
# Decode response
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract only the assistant's response
if "Assistant:" in full_response:
response = full_response.split("Assistant:")[-1].strip()
else:
response = full_response[len(prompt):].strip()
return ChatResponse(response=response, status="success")
except Exception as e:
logger.error(f"Error during generation: {str(e)}")
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
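# Illustrative client call (a sketch, not part of the original file; assumes the
# `requests` package and substitutes your Space's public URL, typically of the
# form https://<username>-<space-name>.hf.space, for the localhost placeholder):
#
#   import requests
#   r = requests.post(
#       "http://localhost:7860/chat",
#       json={"message": "Hello!", "max_length": 256},
#   )
#   print(r.json())  # expected shape: {"response": "...", "status": "success"}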
@app.post("/generate")
async def generate(request: ChatRequest):
"""Alternative generation endpoint"""
return await chat(request)
@app.get("/model-info")
async def model_info():
"""Get model information"""
if model is None:
return {"status": "Model not loaded"}
return {
"model_name": "deepseek-ai/deepseek-r1-distill-qwen-1.5b",
"model_type": type(model).__name__,
"tokenizer_type": type(tokenizer).__name__,
"vocab_size": tokenizer.vocab_size if tokenizer else None,
"device": str(next(model.parameters()).device) if model else None
}
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)