Vaibhav-Singh committed
Commit 25b680f · 1 Parent(s): 495ee6e
Files changed (1)
  1. app.py +94 -41
app.py CHANGED
@@ -1,63 +1,116 @@
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from typing import List
  import torch

- app = FastAPI(title="Language Model API")
-
- # Model configuration
- CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

- # Initialize model and tokenizer
- try:
-     tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
-     model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
- except Exception as e:
-     raise RuntimeError(f"Failed to load model: {str(e)}")

- class ChatMessage(BaseModel):
      role: str
      content: str

  class ChatRequest(BaseModel):
-     messages: List[ChatMessage]
-     max_new_tokens: int = 50
-     temperature: float = 0.2
-     top_p: float = 0.9

- @app.post("/generate")
- async def generate_response(request: ChatRequest):
      try:
-         # Convert messages to the format expected by the model
-         messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]
-
-         # Prepare input
-         input_text = tokenizer.apply_chat_template(messages, tokenize=False)
-         inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)
-
-         # Generate response
-         outputs = model.generate(
-             inputs,
              max_new_tokens=request.max_new_tokens,
-             temperature=request.temperature,
-             top_p=request.top_p,
-             do_sample=True
          )

-         # Decode and return response
-         response_text = tokenizer.decode(outputs[0])

-         return {
-             "generated_text": response_text
-         }

      except Exception as e:
          raise HTTPException(status_code=500, detail=str(e))

  if __name__ == "__main__":
      import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
-
-
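Note on the removed SmolLM2 path: `tokenizer.decode(outputs[0])` decodes the prompt tokens together with the completion, so a client gets its own messages echoed back along with special tokens. Below is a standalone sketch of the same flow that returns only the reply; the example prompt and the `add_generation_prompt=True` flag are illustrative additions, not part of this commit.

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

# Illustrative prompt, not part of the commit.
messages = [{"role": "user", "content": "What is the capital of France?"}]
input_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(
    inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True
)

# Decoding outputs[0] directly (as the removed endpoint did) would include the
# prompt; slicing off the input tokens keeps only the newly generated reply.
reply = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
print(reply)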
 
+ # from fastapi import FastAPI, HTTPException
+ # from pydantic import BaseModel
+ # from transformers import AutoModelForCausalLM, AutoTokenizer
+ # from typing import List
+ # import torch
+
+ # app = FastAPI(title="Language Model API")
+
+ # # Model configuration
+ # CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # # Initialize model and tokenizer
+ # try:
+ #     tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
+ #     model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
+ # except Exception as e:
+ #     raise RuntimeError(f"Failed to load model: {str(e)}")
+
+ # class ChatMessage(BaseModel):
+ #     role: str
+ #     content: str
+
+ # class ChatRequest(BaseModel):
+ #     messages: List[ChatMessage]
+ #     max_new_tokens: int = 50
+ #     temperature: float = 0.2
+ #     top_p: float = 0.9
+
+ # @app.post("/generate")
+ # async def generate_response(request: ChatRequest):
+ #     try:
+ #         # Convert messages to the format expected by the model
+ #         messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]
+
+ #         # Prepare input
+ #         input_text = tokenizer.apply_chat_template(messages, tokenize=False)
+ #         inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)
+
+ #         # Generate response
+ #         outputs = model.generate(
+ #             inputs,
+ #             max_new_tokens=request.max_new_tokens,
+ #             temperature=request.temperature,
+ #             top_p=request.top_p,
+ #             do_sample=True
+ #         )
+
+ #         # Decode and return response
+ #         response_text = tokenizer.decode(outputs[0])
+
+ #         return {
+ #             "generated_text": response_text
+ #         }
+
+ #     except Exception as e:
+ #         raise HTTPException(status_code=500, detail=str(e))
+
+ # if __name__ == "__main__":
+ #     import uvicorn
+ #     uvicorn.run(app, host="0.0.0.0", port=7860)
+
+
+
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
+ from typing import List, Dict
+ import transformers
  import torch

+ app = FastAPI(title="LLaMA API")

+ # Initialize the model and pipeline at startup
+ model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ pipeline = transformers.pipeline(
+     "text-generation",
+     model=model_id,
+     model_kwargs={"torch_dtype": torch.bfloat16},
+     device_map="auto",
+ )

+ class Message(BaseModel):
      role: str
      content: str

  class ChatRequest(BaseModel):
+     messages: List[Message]
+     max_new_tokens: int = 256
+
+ class ChatResponse(BaseModel):
+     generated_text: str

+ @app.post("/generate", response_model=ChatResponse)
+ async def chat(request: ChatRequest):
      try:
+         outputs = pipeline(
+             [{"role": msg.role, "content": msg.content} for msg in request.messages],
              max_new_tokens=request.max_new_tokens,
          )

+         # The chat pipeline returns the full message list; the last element is
+         # the newly generated assistant message, so return its "content" text.
+         generated_text = outputs[0]["generated_text"][-1]["content"]

+         return ChatResponse(generated_text=generated_text)

      except Exception as e:
          raise HTTPException(status_code=500, detail=str(e))

+ # Health check endpoint
+ @app.get("/")
+ async def health_check():
+     return {"status": "healthy"}
+
  if __name__ == "__main__":
      import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
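For completeness, a minimal client sketch against the new API, assuming the server is running locally on port 8000 as configured above; the `requests` dependency and the example messages are illustrative and not part of this commit.

import requests

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain what FastAPI is in one sentence."},
    ],
    "max_new_tokens": 128,
}

# POST /generate returns {"generated_text": "..."} per the ChatResponse model.
resp = requests.post("http://localhost:8000/generate", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["generated_text"])

# GET / is the health check added in this commit.
print(requests.get("http://localhost:8000/", timeout=10).json())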