from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List
import torch

app = FastAPI(title="Language Model API")

# Model configuration
CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model and tokenizer once at startup
try:
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    max_new_tokens: int = 50
    temperature: float = 0.2
    top_p: float = 0.9


@app.post("/generate")
async def generate_response(request: ChatRequest):
    try:
        # Convert messages to the format expected by the chat template
        messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]

        # Prepare input: apply the chat template with a generation prompt so the
        # model answers as the assistant instead of continuing the last user turn
        input_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)

        # Generate response
        outputs = model.generate(
            inputs,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            do_sample=True,
        )

        # Decode only the newly generated tokens, dropping the prompt and special tokens
        response_text = tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        )
        return {"generated_text": response_text}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
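
# --- Example client (sketch) ---
# A minimal illustration of how the /generate endpoint above could be called.
# It assumes the server is running locally on port 7860 (the default set in the
# __main__ block) and that the `requests` package is installed; run it from a
# separate process while the server is up.
#
# import requests
#
# payload = {
#     "messages": [{"role": "user", "content": "What is the capital of France?"}],
#     "max_new_tokens": 64,
# }
# response = requests.post("http://localhost:7860/generate", json=payload, timeout=60)
# print(response.json()["generated_text"])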