from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model configuration
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Base"  # Hugging Face model

# Load model and tokenizer. With device_map="auto", accelerate places the
# weights across the available devices itself, so the model must NOT be moved
# again with .to(device) -- calling .to() on a dispatched model is an error.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        revision="main",
    )
    model.eval()
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# FastAPI app initialization
app = FastAPI()

# Input schema
class Query(BaseModel):
    input_text: str

@app.post("/predict")
def predict(query: Query):
    # A plain `def` endpoint lets FastAPI run this blocking generate() call in
    # a worker thread instead of stalling the async event loop.
    input_text = query.input_text
    if not input_text:
        raise HTTPException(status_code=400, detail="Input text cannot be empty.")
    # Move inputs to the device holding the model's first parameters, and pass
    # the attention mask along with input_ids via **inputs.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # temperature only takes effect when sampling is enabled.
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}
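
# --- Usage sketch (assumptions: the file is saved as main.py, uvicorn is
# installed, and the server listens on localhost:8000; adjust the module
# path and port to your setup) ---
#
# Start the server:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Send a test request:
#   curl -X POST http://localhost:8000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"input_text": "Hello, world"}'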