Spaces:

fartinalbania
/

st-chat-1

Runtime error

App Files Files Community

fartinalbania committed on Jun 1

Commit

b3c00c9

verified ·

1 Parent(s): 73ad8cb

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -48

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # PowerThought FastAPI Chat Server
-# Requirements: pip install fastapi transformers torch gradio uvicorn accelerate bitsandbytes
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
@@ -24,51 +25,100 @@ app.add_middleware(
     allow_headers=["*"],
 )
-MODEL_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Load model and tokenizer with better error handling
 print("Loading model...")
-pipe = None  # Initialize pipeline variable
-try:
-    tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_ID,
-        trust_remote_code=True,
-        use_fast=True
-    )
-    # Add pad token if it doesn't exist
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto" if torch.cuda.is_available() else None,
-        trust_remote_code=True
-    )
-    print("Model loaded successfully!")
-except Exception as e:
-    print(f"Error loading model: {e}")
-    print("Falling back to pipeline method...")
-    # Fallback to pipeline method
     try:
         pipe = pipeline(
             "text-generation",
-            model=MODEL_ID,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True
         )
         tokenizer = pipe.tokenizer
         model = pipe.model
-        print("Pipeline fallback loaded successfully!")
-    except Exception as e2:
-        print(f"Pipeline fallback also failed: {e2}")
-        raise Exception(f"Both loading methods failed: {e}, {e2}")
 # PowerThought System Prompt
 POWERTHOUGHT_SYSTEM_PROMPT = """You are PowerThought, a strategic advisor who transforms the 48 Laws of Power into ethical, constructive guidance. You help people navigate complex situations using timeless wisdom while maintaining integrity and building positive relationships.
@@ -299,9 +349,9 @@ def generate_response(conversation_history, max_new_tokens=1500):
     try:
         messages = build_messages(conversation_history)
-        # Check if we're using pipeline or direct model
         if pipe is not None:
-            # Using pipeline method
             response = pipe(
                 messages,
                 max_new_tokens=max_new_tokens,
@@ -313,14 +363,26 @@ def generate_response(conversation_history, max_new_tokens=1500):
             )
             return response[0]['generated_text'].strip()
-        else:
-            # Using direct model method
-            # Apply chat template
-            text = tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True
-            )
             # Tokenize
             inputs = tokenizer(text, return_tensors="pt").to(device)
@@ -334,7 +396,8 @@ def generate_response(conversation_history, max_new_tokens=1500):
                     temperature=0.7,
                     top_p=0.9,
                     repetition_penalty=1.05,
-                    pad_token_id=tokenizer.eos_token_id
                 )
             # Decode only the new tokens
@@ -344,6 +407,32 @@ def generate_response(conversation_history, max_new_tokens=1500):
             )
             return generated_text.strip()
     except Exception as e:
         logger.error(f"Generation error: {str(e)}")

 # PowerThought FastAPI Chat Server
+# Requirements: pip install fastapi transformers torch gradio uvicorn accelerate
+# Optional for GPU quantization: pip install bitsandbytes
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
     allow_headers=["*"],
 )
+# Model selection constants.
+# NOTE(review): the initial MODEL_ID value is not attempted by any of the loading
+# steps visible in this file; MODEL_ID is reassigned to the name of whichever
+# model actually loaded once loading finishes.
+MODEL_ID = "microsoft/DialoGPT-large"  # Fallback reliable model
+PREFERRED_MODEL = "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit"  # Preferred but needs GPU
+# Tried in list order when neither DeepSeek variant can be loaded.
+FALLBACK_MODELS = [
+    "microsoft/DialoGPT-medium",
+    "microsoft/DialoGPT-small",
+    "gpt2"
+]
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Device detected: {device}")
+# Load model and tokenizer with multiple fallbacks
 print("Loading model...")
+# Module-level state populated by the loading sequence. `pipe` stays None unless
+# the final pipeline fallback is used; `current_model` records the id that loaded.
+pipe = None
+model = None
+tokenizer = None
+current_model = None
+def try_load_model(model_id, use_quantization=False):
+    """Try to load a tokenizer and causal-LM model for ``model_id``.
+
+    Args:
+        model_id: Model identifier passed to ``from_pretrained``.
+        use_quantization: When True and CUDA is available, load in float16
+            with ``device_map="auto"`` and a fast tokenizer. Otherwise load
+            the model normally and move it to the module-level ``device``.
+
+    Returns:
+        ``(tokenizer, model, model_id)`` on success, or ``(None, None, None)``
+        if any exception is raised while loading (the error is printed, not
+        re-raised).
+    """
+    try:
+        print(f"Attempting to load: {model_id}")
+        if use_quantization and torch.cuda.is_available():
+            # Try quantized version on GPU
+            # NOTE(review): no quantization config is passed here; presumably the
+            # checkpoint itself is pre-quantized (bnb-4bit) -- confirm.
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id,
+                trust_remote_code=True,
+                use_fast=True
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True
+            )
+        else:
+            # Try regular version
+            # float16 on GPU, float32 on CPU; the model is moved explicitly to the
+            # module-level `device` because no device_map is used here.
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id,
+                trust_remote_code=True
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                trust_remote_code=True
+            ).to(device)
+        # Add pad token if needed
+        # (reuses the EOS token as the padding token when none is defined)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        return tokenizer, model, model_id
+    except Exception as e:
+        # Any failure is reported and signalled with a triple of None so the
+        # caller can move on to the next candidate model.
+        print(f"Failed to load {model_id}: {e}")
+        return None, None, None
+# Try preferred model first (with quantization if GPU available)
+if torch.cuda.is_available():
+    tokenizer, model, current_model = try_load_model(PREFERRED_MODEL, use_quantization=True)
+# If that failed, try regular DeepSeek
+# (this step also runs when no GPU is available, because `model` is still None)
+if model is None:
+    tokenizer, model, current_model = try_load_model("deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", use_quantization=False)
+# If that failed, try fallback models
+# (stops at the first entry of FALLBACK_MODELS that loads)
+if model is None:
+    for fallback_model in FALLBACK_MODELS:
+        tokenizer, model, current_model = try_load_model(fallback_model, use_quantization=False)
+        if model is not None:
+            break
+# Final fallback to pipeline method with GPT-2
+# NOTE(review): "gpt2" is also the last entry of FALLBACK_MODELS, so this branch
+# is only reached after a direct GPT-2 load has already failed.
+if model is None:
     try:
+        print("Using pipeline fallback with GPT-2...")
         pipe = pipeline(
             "text-generation",
+            model="gpt2",
+            tokenizer="gpt2"
         )
         tokenizer = pipe.tokenizer
         model = pipe.model
+        current_model = "gpt2"
+        print("Pipeline with GPT-2 loaded successfully!")
+    except Exception as e:
+        # Nothing left to try: abort startup with the last error attached.
+        raise Exception(f"All loading methods failed. Last error: {e}")
+# Record which model ended up loaded. The else branch is a safety net for the
+# case where `model` is still None after every attempt.
+if model is not None:
+    MODEL_ID = current_model  # Update MODEL_ID to reflect what actually loaded
+    print(f"Successfully loaded: {MODEL_ID}")
+else:
+    raise Exception("Failed to load any model")
 # PowerThought System Prompt
 POWERTHOUGHT_SYSTEM_PROMPT = """You are PowerThought, a strategic advisor who transforms the 48 Laws of Power into ethical, constructive guidance. You help people navigate complex situations using timeless wisdom while maintaining integrity and building positive relationships.
     try:
         messages = build_messages(conversation_history)
+        # Check if we're using pipeline
         if pipe is not None:
+            # For pipeline method
             response = pipe(
                 messages,
                 max_new_tokens=max_new_tokens,
             )
             return response[0]['generated_text'].strip()
+        # For direct model method
+        try:
+            # Try chat template first (for modern models)
+            if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template:
+                text = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+            else:
+                # Fallback for older models (like DialoGPT, GPT-2)
+                text = ""
+                for msg in messages:
+                    if msg["role"] == "system":
+                        text += f"System: {msg['content']}\n\n"
+                    elif msg["role"] == "user":
+                        text += f"User: {msg['content']}\n"
+                    elif msg["role"] == "assistant":
+                        text += f"Assistant: {msg['content']}\n"
+                text += "Assistant: "
             # Tokenize
             inputs = tokenizer(text, return_tensors="pt").to(device)
                     temperature=0.7,
                     top_p=0.9,
                     repetition_penalty=1.05,
+                    pad_token_id=tokenizer.eos_token_id,
+                    eos_token_id=tokenizer.eos_token_id
                 )
             # Decode only the new tokens
             )
             return generated_text.strip()
+        except Exception as e:
+            logger.error(f"Chat template failed, using simple concatenation: {e}")
+            # Simple fallback - just concatenate the last user message with system prompt
+            full_text = f"{POWERTHOUGHT_SYSTEM_PROMPT}\n\nUser: {conversation_history[-1]['content']}\nAssistant: "
+            inputs = tokenizer(full_text, return_tensors="pt").to(device)
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    repetition_penalty=1.05,
+                    pad_token_id=tokenizer.eos_token_id
+                )
+            generated_text = tokenizer.decode(
+                generated_ids[0][inputs.input_ids.shape[-1]:],
+                skip_special_tokens=True
+            )
+            return generated_text.strip()
     except Exception as e:
         logger.error(f"Generation error: {str(e)}")