# Earlier version, kept for reference: serves HuggingFaceTB/SmolLM2-135M-Instruct
# directly with AutoModelForCausalLM/AutoTokenizer instead of a pipeline.
#
# from fastapi import FastAPI, HTTPException
# from pydantic import BaseModel
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from typing import List
# import torch
#
# app = FastAPI(title="Language Model API")
#
# # Model configuration
# CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#
# # Initialize model and tokenizer
# try:
#     tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
#     model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
# except Exception as e:
#     raise RuntimeError(f"Failed to load model: {str(e)}")
#
# class ChatMessage(BaseModel):
#     role: str
#     content: str
#
# class ChatRequest(BaseModel):
#     messages: List[ChatMessage]
#     max_new_tokens: int = 50
#     temperature: float = 0.2
#     top_p: float = 0.9
#
# @app.post("/generate")
# async def generate_response(request: ChatRequest):
#     try:
#         # Convert messages to the format expected by the chat template
#         messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]
#
#         # Prepare input (add_generation_prompt=True so the model answers as the assistant)
#         input_text = tokenizer.apply_chat_template(
#             messages, tokenize=False, add_generation_prompt=True
#         )
#         inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)
#
#         # Generate response
#         outputs = model.generate(
#             inputs,
#             max_new_tokens=request.max_new_tokens,
#             temperature=request.temperature,
#             top_p=request.top_p,
#             do_sample=True,
#         )
#
#         # Decode only the newly generated tokens, skipping special tokens
#         response_text = tokenizer.decode(
#             outputs[0][inputs.shape[-1]:], skip_special_tokens=True
#         )
#         return {"generated_text": response_text}
#     except Exception as e:
#         raise HTTPException(status_code=500, detail=str(e))
#
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=7860)

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List

import torch
import transformers

app = FastAPI(title="LLaMA API")

# Initialize the model and pipeline at startup
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)


class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    messages: List[Message]
    max_new_tokens: int = 256


class ChatResponse(BaseModel):
    generated_text: str


@app.post("/generate", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        outputs = pipeline(
            [{"role": msg.role, "content": msg.content} for msg in request.messages],
            max_new_tokens=request.max_new_tokens,
        )
        # The pipeline returns the whole conversation (input messages plus the new
        # assistant turn) under "generated_text"; return only the assistant reply's text.
        generated_text = outputs[0]["generated_text"][-1]["content"]
        return ChatResponse(generated_text=generated_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# Health check endpoint
@app.get("/")
async def health_check():
    return {"status": "healthy"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
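
# Example client call (a minimal sketch, assuming the server was started via the
# uvicorn call above and is listening on localhost:8000, and that the third-party
# `requests` package is installed):
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={
#           "messages": [{"role": "user", "content": "Who are you?"}],
#           "max_new_tokens": 128,
#       },
#   )
#   print(resp.json()["generated_text"])  # only the assistant's reply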