fartinalbania committed
Commit b3c00c9 · verified · 1 Parent(s): 73ad8cb

Update app.py

Files changed (1):
  1. app.py +137 -48
app.py CHANGED
@@ -1,5 +1,6 @@
 # PowerThought FastAPI Chat Server
-# Requirements: pip install fastapi transformers torch gradio uvicorn accelerate bitsandbytes
+# Requirements: pip install fastapi transformers torch gradio uvicorn accelerate
+# Optional for GPU quantization: pip install bitsandbytes

 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
@@ -24,51 +25,100 @@ app.add_middleware(
     allow_headers=["*"],
 )

-MODEL_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit"
+MODEL_ID = "microsoft/DialoGPT-large"  # Fallback reliable model
+PREFERRED_MODEL = "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit"  # Preferred but needs GPU
+FALLBACK_MODELS = [
+    "microsoft/DialoGPT-medium",
+    "microsoft/DialoGPT-small",
+    "gpt2"
+]
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Device detected: {device}")

-# Load model and tokenizer with better error handling
+# Load model and tokenizer with multiple fallbacks
 print("Loading model...")
-pipe = None  # Initialize pipeline variable
-
-try:
-    tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_ID,
-        trust_remote_code=True,
-        use_fast=True
-    )
-
-    # Add pad token if it doesn't exist
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto" if torch.cuda.is_available() else None,
-        trust_remote_code=True
-    )
-    print("Model loaded successfully!")
-
-except Exception as e:
-    print(f"Error loading model: {e}")
-    print("Falling back to pipeline method...")
-
-    # Fallback to pipeline method
+pipe = None
+model = None
+tokenizer = None
+current_model = None
+
+def try_load_model(model_id, use_quantization=False):
+    """Try to load a specific model"""
+    try:
+        print(f"Attempting to load: {model_id}")
+
+        if use_quantization and torch.cuda.is_available():
+            # Try quantized version on GPU
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id,
+                trust_remote_code=True,
+                use_fast=True
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True
+            )
+        else:
+            # Try regular version
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id,
+                trust_remote_code=True
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                trust_remote_code=True
+            ).to(device)
+
+        # Add pad token if needed
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        return tokenizer, model, model_id
+
+    except Exception as e:
+        print(f"Failed to load {model_id}: {e}")
+        return None, None, None
+
+# Try preferred model first (with quantization if GPU available)
+if torch.cuda.is_available():
+    tokenizer, model, current_model = try_load_model(PREFERRED_MODEL, use_quantization=True)
+
+# If that failed, try regular DeepSeek
+if model is None:
+    tokenizer, model, current_model = try_load_model("deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", use_quantization=False)
+
+# If that failed, try fallback models
+if model is None:
+    for fallback_model in FALLBACK_MODELS:
+        tokenizer, model, current_model = try_load_model(fallback_model, use_quantization=False)
+        if model is not None:
+            break
+
+# Final fallback to pipeline method with GPT-2
+if model is None:
     try:
+        print("Using pipeline fallback with GPT-2...")
         pipe = pipeline(
             "text-generation",
-            model=MODEL_ID,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True
+            model="gpt2",
+            tokenizer="gpt2"
         )
         tokenizer = pipe.tokenizer
         model = pipe.model
-        print("Pipeline fallback loaded successfully!")
-    except Exception as e2:
-        print(f"Pipeline fallback also failed: {e2}")
-        raise Exception(f"Both loading methods failed: {e}, {e2}")
+        current_model = "gpt2"
+        print("Pipeline with GPT-2 loaded successfully!")
+    except Exception as e:
+        raise Exception(f"All loading methods failed. Last error: {e}")
+
+if model is not None:
+    MODEL_ID = current_model  # Update MODEL_ID to reflect what actually loaded
+    print(f"Successfully loaded: {MODEL_ID}")
+else:
+    raise Exception("Failed to load any model")

 # PowerThought System Prompt
 POWERTHOUGHT_SYSTEM_PROMPT = """You are PowerThought, a strategic advisor who transforms the 48 Laws of Power into ethical, constructive guidance. You help people navigate complex situations using timeless wisdom while maintaining integrity and building positive relationships.
@@ -299,9 +349,9 @@ def generate_response(conversation_history, max_new_tokens=1500):
     try:
         messages = build_messages(conversation_history)

-        # Check if we're using pipeline or direct model
+        # Check if we're using pipeline
         if pipe is not None:
-            # Using pipeline method
+            # For pipeline method
             response = pipe(
                 messages,
                 max_new_tokens=max_new_tokens,
@@ -313,14 +363,26 @@ def generate_response(conversation_history, max_new_tokens=1500):
             )
             return response[0]['generated_text'].strip()

-        else:
-            # Using direct model method
-            # Apply chat template
-            text = tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True
-            )
+        # For direct model method
+        try:
+            # Try chat template first (for modern models)
+            if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template:
+                text = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+            else:
+                # Fallback for older models (like DialoGPT, GPT-2)
+                text = ""
+                for msg in messages:
+                    if msg["role"] == "system":
+                        text += f"System: {msg['content']}\n\n"
+                    elif msg["role"] == "user":
+                        text += f"User: {msg['content']}\n"
+                    elif msg["role"] == "assistant":
+                        text += f"Assistant: {msg['content']}\n"
+                text += "Assistant: "

             # Tokenize
             inputs = tokenizer(text, return_tensors="pt").to(device)
@@ -334,7 +396,8 @@ def generate_response(conversation_history, max_new_tokens=1500):
                     temperature=0.7,
                     top_p=0.9,
                     repetition_penalty=1.05,
-                    pad_token_id=tokenizer.eos_token_id
+                    pad_token_id=tokenizer.eos_token_id,
+                    eos_token_id=tokenizer.eos_token_id
                 )

                 # Decode only the new tokens
@@ -344,6 +407,32 @@ def generate_response(conversation_history, max_new_tokens=1500):
             )

             return generated_text.strip()
+
+        except Exception as e:
+            logger.error(f"Chat template failed, using simple concatenation: {e}")
+
+            # Simple fallback - just concatenate the last user message with system prompt
+            full_text = f"{POWERTHOUGHT_SYSTEM_PROMPT}\n\nUser: {conversation_history[-1]['content']}\nAssistant: "
+
+            inputs = tokenizer(full_text, return_tensors="pt").to(device)
+
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    repetition_penalty=1.05,
+                    pad_token_id=tokenizer.eos_token_id
+                )
+
+            generated_text = tokenizer.decode(
+                generated_ids[0][inputs.input_ids.shape[-1]:],
+                skip_special_tokens=True
+            )
+
+            return generated_text.strip()

     except Exception as e:
         logger.error(f"Generation error: {str(e)}")
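
For reference (not part of the commit), below is a minimal standalone sketch of the plain-text prompt fallback that the new generate_response path uses for models without a chat template (e.g. DialoGPT, GPT-2). The helper name format_messages_plain and the demo messages are hypothetical; the formatting mirrors the else-branch added in this diff.

# Standalone sketch; helper name and demo messages are assumptions, not part of app.py.
def format_messages_plain(messages):
    """Flatten chat messages into a single prompt string for models
    that have no tokenizer.chat_template (e.g. DialoGPT, GPT-2)."""
    text = ""
    for msg in messages:
        if msg["role"] == "system":
            text += f"System: {msg['content']}\n\n"
        elif msg["role"] == "user":
            text += f"User: {msg['content']}\n"
        elif msg["role"] == "assistant":
            text += f"Assistant: {msg['content']}\n"
    # Trailing "Assistant: " cues the model to continue as the assistant.
    return text + "Assistant: "

if __name__ == "__main__":
    demo = [
        {"role": "system", "content": "You are PowerThought, a strategic advisor."},
        {"role": "user", "content": "How do I handle a difficult coworker?"},
    ]
    print(format_messages_plain(demo))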