fartinalbania committed
Commit a012b32 · verified · 1 parent: 513261a

Update app.py

Files changed (1):
  1. app.py +91 -29
app.py CHANGED
@@ -1,7 +1,10 @@
+# PowerThought FastAPI Chat Server
+# Requirements: pip install fastapi transformers torch gradio uvicorn accelerate bitsandbytes
+
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 import logging
 import gradio as gr
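Note: the newly imported pipeline is the high-level transformers entry point used by the fallback path added below. A minimal illustration of what it does, using a deliberately small stand-in model rather than the one this Space loads:

from transformers import pipeline

# "gpt2" is a hypothetical stand-in for a quick local check; the Space
# itself loads MODEL_ID instead.
demo = pipeline("text-generation", model="gpt2")
print(demo("Hello, world", max_new_tokens=5)[0]["generated_text"])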
@@ -21,14 +24,51 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-MODEL_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
+MODEL_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Load model and tokenizer
+# Load model and tokenizer with better error handling
 print("Loading model...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32).to(device)
-print("Model loaded successfully!")
+pipe = None  # Initialize pipeline variable
+
+try:
+    tokenizer = AutoTokenizer.from_pretrained(
+        MODEL_ID,
+        trust_remote_code=True,
+        use_fast=True
+    )
+
+    # Add pad token if it doesn't exist
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto" if torch.cuda.is_available() else None,
+        trust_remote_code=True
+    )
+    print("Model loaded successfully!")
+
+except Exception as e:
+    print(f"Error loading model: {e}")
+    print("Falling back to pipeline method...")
+
+    # Fallback to pipeline method
+    try:
+        pipe = pipeline(
+            "text-generation",
+            model=MODEL_ID,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True
+        )
+        tokenizer = pipe.tokenizer
+        model = pipe.model
+        print("Pipeline fallback loaded successfully!")
+    except Exception as e2:
+        print(f"Pipeline fallback also failed: {e2}")
+        raise Exception(f"Both loading methods failed: {e}, {e2}")
 
 # PowerThought System Prompt
 POWERTHOUGHT_SYSTEM_PROMPT = """You are PowerThought, a strategic advisor who transforms the 48 Laws of Power into ethical, constructive guidance. You help people navigate complex situations using timeless wisdom while maintaining integrity and building positive relationships.
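Design note: the commit tries a direct from_pretrained load first (which allows the pad-token fix and explicit dtype/device placement) and only falls back to the coarser pipeline helper if that raises; if both paths fail, the Space aborts at startup. A minimal smoke-test sketch, assuming the module-level globals above are in scope:

# Sketch only: exercises whichever loading path succeeded, assuming the
# module-level globals `pipe`, `model`, and `tokenizer` defined above.
if pipe is not None:
    out = pipe("Say hi.", max_new_tokens=8)[0]["generated_text"]
else:
    ids = tokenizer("Say hi.", return_tensors="pt").to(model.device)
    out = tokenizer.decode(model.generate(**ids, max_new_tokens=8)[0],
                           skip_special_tokens=True)
print("smoke test:", out)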
@@ -259,35 +299,51 @@ def generate_response(conversation_history, max_new_tokens=1500):
     try:
         messages = build_messages(conversation_history)
 
-        # Apply chat template
-        text = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        # Tokenize
-        inputs = tokenizer(text, return_tensors="pt").to(device)
-
-        # Generate
-        with torch.no_grad():
-            generated_ids = model.generate(
-                **inputs,
+        # Check if we're using pipeline or direct model
+        if pipe is not None:
+            # Using pipeline method
+            response = pipe(
+                messages,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.9,
                 repetition_penalty=1.05,
-                pad_token_id=tokenizer.eos_token_id
+                return_full_text=False
             )
+            return response[0]['generated_text'].strip()
 
-        # Decode only the new tokens
-        generated_text = tokenizer.decode(
-            generated_ids[0][inputs.input_ids.shape[-1]:],
-            skip_special_tokens=True
-        )
-
-        return generated_text.strip()
+        else:
+            # Using direct model method
+            # Apply chat template
+            text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+
+            # Tokenize
+            inputs = tokenizer(text, return_tensors="pt").to(device)
+
+            # Generate
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    repetition_penalty=1.05,
+                    pad_token_id=tokenizer.eos_token_id
+                )
+
+            # Decode only the new tokens
+            generated_text = tokenizer.decode(
+                generated_ids[0][inputs.input_ids.shape[-1]:],
+                skip_special_tokens=True
+            )
+
+            return generated_text.strip()
 
     except Exception as e:
         logger.error(f"Generation error: {str(e)}")
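For reference, a hedged usage sketch of the rewritten generate_response. The exact history shape depends on build_messages, which this commit does not touch; it is assumed here to consume OpenAI-style role/content dicts:

# Hypothetical call; the {"role", "content"} schema is an assumption
# about what build_messages() accepts, not shown in this diff.
history = [
    {"role": "user",
     "content": "A colleague keeps taking credit for my work. What should I do?"}
]
reply = generate_response(history, max_new_tokens=512)
print(reply)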
@@ -305,7 +361,13 @@ async def chat_endpoint(request: ChatRequest):
 
 @app.get("/api/health")
 async def health_check():
-    return {"status": "healthy", "model": MODEL_ID}
+    loading_method = "pipeline" if pipe is not None else "direct"
+    return {
+        "status": "healthy",
+        "model": MODEL_ID,
+        "loading_method": loading_method,
+        "device": str(device)
+    }
 
 # Gradio interface function
 def gradio_chat(message, history):
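The enriched health payload now reports which loading path won and the active device. A quick check once the server is up, sketched with requests (port 7860 is an assumption, being the usual Hugging Face Spaces default):

import requests

# Port 7860 is an assumption; adjust to wherever uvicorn/gradio is serving.
info = requests.get("http://localhost:7860/api/health").json()
print(info["status"], info["loading_method"], info["device"])
# e.g. healthy direct cuda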