fartinalbania committed
Commit a012b32 · verified · 1 parent: 513261a

Update app.py

Files changed (1):
  1. app.py +91 -29
app.py CHANGED
@@ -1,7 +1,10 @@
+# PowerThought FastAPI Chat Server
+# Requirements: pip install fastapi transformers torch gradio uvicorn accelerate bitsandbytes
+
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 import logging
 import gradio as gr
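Note: the newly imported pipeline is the high-level transformers entry point used by the fallback path added below. A minimal illustration of what it does, using a deliberately small stand-in model rather than the one this Space loads:

from transformers import pipeline

# "gpt2" is a hypothetical stand-in for a quick local check; the Space
# itself loads MODEL_ID instead.
demo = pipeline("text-generation", model="gpt2")
print(demo("Hello, world", max_new_tokens=5)[0]["generated_text"])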
@@ -21,14 +24,51 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-MODEL_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
+MODEL_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Load model and tokenizer
+# Load model and tokenizer with better error handling
 print("Loading model...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32).to(device)
-print("Model loaded successfully!")
+pipe = None  # Initialize pipeline variable
+
+try:
+    tokenizer = AutoTokenizer.from_pretrained(
+        MODEL_ID,
+        trust_remote_code=True,
+        use_fast=True
+    )
+
+    # Add pad token if it doesn't exist
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto" if torch.cuda.is_available() else None,
+        trust_remote_code=True
+    )
+    print("Model loaded successfully!")
+
+except Exception as e:
+    print(f"Error loading model: {e}")
+    print("Falling back to pipeline method...")
+
+    # Fallback to pipeline method
+    try:
+        pipe = pipeline(
+            "text-generation",
+            model=MODEL_ID,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True
+        )
+        tokenizer = pipe.tokenizer
+        model = pipe.model
+        print("Pipeline fallback loaded successfully!")
+    except Exception as e2:
+        print(f"Pipeline fallback also failed: {e2}")
+        raise Exception(f"Both loading methods failed: {e}, {e2}")
 
 # PowerThought System Prompt
 POWERTHOUGHT_SYSTEM_PROMPT = """You are PowerThought, a strategic advisor who transforms the 48 Laws of Power into ethical, constructive guidance. You help people navigate complex situations using timeless wisdom while maintaining integrity and building positive relationships.
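Design note: the commit tries a direct from_pretrained load first (which allows the pad-token fix and explicit dtype/device placement) and only falls back to the coarser pipeline helper if that raises; if both paths fail, the Space aborts at startup. A minimal smoke-test sketch, assuming the module-level globals above are in scope:

# Sketch only: exercises whichever loading path succeeded, assuming the
# module-level globals `pipe`, `model`, and `tokenizer` defined above.
if pipe is not None:
    out = pipe("Say hi.", max_new_tokens=8)[0]["generated_text"]
else:
    ids = tokenizer("Say hi.", return_tensors="pt").to(model.device)
    out = tokenizer.decode(model.generate(**ids, max_new_tokens=8)[0],
                           skip_special_tokens=True)
print("smoke test:", out)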
@@ -259,35 +299,51 @@ def generate_response(conversation_history, max_new_tokens=1500):
     try:
         messages = build_messages(conversation_history)
 
-        # Apply chat template
-        text = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        # Tokenize
-        inputs = tokenizer(text, return_tensors="pt").to(device)
-
-        # Generate
-        with torch.no_grad():
-            generated_ids = model.generate(
-                **inputs,
+        # Check if we're using pipeline or direct model
+        if pipe is not None:
+            # Using pipeline method
+            response = pipe(
+                messages,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.9,
                 repetition_penalty=1.05,
-                pad_token_id=tokenizer.eos_token_id
+                return_full_text=False
             )
+            return response[0]['generated_text'].strip()
 
-        # Decode only the new tokens
-        generated_text = tokenizer.decode(
-            generated_ids[0][inputs.input_ids.shape[-1]:],
-            skip_special_tokens=True
-        )
-
-        return generated_text.strip()
+        else:
+            # Using direct model method
+            # Apply chat template
+            text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+
+            # Tokenize
+            inputs = tokenizer(text, return_tensors="pt").to(device)
+
+            # Generate
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    repetition_penalty=1.05,
+                    pad_token_id=tokenizer.eos_token_id
+                )
+
+            # Decode only the new tokens
+            generated_text = tokenizer.decode(
+                generated_ids[0][inputs.input_ids.shape[-1]:],
+                skip_special_tokens=True
+            )
+
+            return generated_text.strip()
 
     except Exception as e:
         logger.error(f"Generation error: {str(e)}")
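For reference, a hedged usage sketch of the rewritten generate_response. The exact history shape depends on build_messages, which this commit does not touch; it is assumed here to consume OpenAI-style role/content dicts:

# Hypothetical call; the {"role", "content"} schema is an assumption
# about what build_messages() accepts, not shown in this diff.
history = [
    {"role": "user",
     "content": "A colleague keeps taking credit for my work. What should I do?"}
]
reply = generate_response(history, max_new_tokens=512)
print(reply)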
@@ -305,7 +361,13 @@ async def chat_endpoint(request: ChatRequest):
 
 @app.get("/api/health")
 async def health_check():
-    return {"status": "healthy", "model": MODEL_ID}
+    loading_method = "pipeline" if pipe is not None else "direct"
+    return {
+        "status": "healthy",
+        "model": MODEL_ID,
+        "loading_method": loading_method,
+        "device": str(device)
+    }
 
 # Gradio interface function
 def gradio_chat(message, history):
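The enriched health payload now reports which loading path won and the active device. A quick check once the server is up, sketched with requests (port 7860 is an assumption, being the usual Hugging Face Spaces default):

import requests

# Port 7860 is an assumption; adjust to wherever uvicorn/gradio is serving.
info = requests.get("http://localhost:7860/api/health").json()
print(info["status"], info["loading_method"], info["device"])
# e.g. healthy direct cuda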