from fastapi import FastAPI, Query
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os

app = FastAPI()

# Create the offload folder if it does not exist
os.makedirs("./offload", exist_ok=True)

# Load the tokenizer and model; offload_folder lets weights spill to disk
# when device_map="auto" cannot fit the full model on the available device
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base")
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-llm-7b-base",
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="./offload",
)


@app.get("/")
def home():
    return {
        "message": "✅ DeepSeek LLM is running. Use endpoint /ask?prompt=your+question"
    }


@app.get("/ask")
def ask(prompt: str = Query(..., description="Your input prompt")):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # do_sample=True is required for temperature to take effect;
    # without it, generate() falls back to greedy decoding
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=100, do_sample=True, temperature=0.7
        )
    reply = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": reply}
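

# Optional entry point so the API can also be launched with `python main.py`.
# A minimal sketch: it assumes this file is saved as main.py and that uvicorn
# is installed; the host/port values are illustrative defaults, not required.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Once the server is up, the /ask endpoint can be queried with a URL-encoded
# prompt, for example:
#
#   curl "http://localhost:8000/ask?prompt=What+is+FastAPI%3F"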