AlessandroCossard committed on
Commit
06b668f
·
verified ·
1 Parent(s): 92f4773

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -55
app.py CHANGED
@@ -4,14 +4,14 @@ import os
4
  import requests
5
  import time
6
 
7
- # Percorso locale del modello - Llama-2-7B-Chat più potente
8
- MODEL_PATH = "llama-2-7b-chat.Q4_K_M.gguf"
9
- MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
10
 
11
  def download_model():
12
  """Scarica il modello se non esiste già"""
13
  if not os.path.exists(MODEL_PATH):
14
- print("📥 Downloading Llama-2-7B-Chat model...")
15
  try:
16
  response = requests.get(MODEL_URL, stream=True, timeout=300)
17
  response.raise_for_status()
@@ -29,7 +29,7 @@ def download_model():
29
  print(f"📥 Download progress: {progress:.1f}%")
30
 
31
  # Verifica che il file sia completo
32
- if os.path.getsize(MODEL_PATH) < 1000000: # Almeno 1MB
33
  print("❌ Downloaded file seems corrupted")
34
  os.remove(MODEL_PATH)
35
  return False
@@ -45,7 +45,7 @@ def download_model():
45
  else:
46
  print("✅ Model already exists!")
47
  # Verifica che il file esistente sia valido
48
- if os.path.getsize(MODEL_PATH) < 1000000:
49
  print("❌ Existing file seems corrupted, re-downloading...")
50
  os.remove(MODEL_PATH)
51
  return download_model() # Riprova
@@ -56,28 +56,32 @@ model_loaded = download_model()
56
  llm = None # Inizializza a None
57
 
58
  if model_loaded:
59
- # Inizializza il modello OTTIMIZZATO per HF Free
60
  try:
61
  llm = Llama(
62
  model_path=MODEL_PATH,
63
- n_ctx=256, # MOLTO ridotto per HF Free
64
- n_threads=2, # Solo 2 CPU cores su HF Free
65
- n_batch=128, # Ridotto per memoria limitata
66
  use_mlock=False, # Disabilitato per HF Free
67
  verbose=False,
68
  n_gpu_layers=0,
69
  use_mmap=True, # Usa memory mapping per efficienza
70
- low_vram=True # Modalità low memory
 
 
71
  )
72
- print("✅ Model loaded successfully!")
73
  except Exception as e:
74
  print(f"❌ Error loading model: {e}")
75
  llm = None
76
  else:
77
  print("❌ Model not available, using fallback responses")
78
 
79
- # System prompt COMPATTO per velocità
80
- system_prompt = """You are a D&D Dungeon Master. Be concise but vivid. Always end with a question or choice for the player."""
 
 
81
 
82
  def generate_random_opening():
83
  """Genera un inizio casuale per l'avventura usando l'AI"""
@@ -92,23 +96,24 @@ def generate_random_opening():
92
  return f"🌟 **New Adventure!** 🌟\n\n{random.choice(openings)}"
93
 
94
  try:
95
- # Prompt per generare apertura
96
- opening_prompt = f"{system_prompt}\n\nGenerate a creative D&D adventure opening in 1-2 sentences. End with a question for the player.\nDM:"
 
 
 
 
97
 
98
  output = llm(
99
  opening_prompt,
100
- max_tokens=50,
101
  temperature=0.8,
102
  top_p=0.9,
103
  repeat_penalty=1.1,
104
- stop=["Player:", "Human:", "\n\n"]
105
  )
106
 
107
  opening = output["choices"][0]["text"].strip()
108
 
109
- if opening.startswith("DM:"):
110
- opening = opening[3:].strip()
111
-
112
  # Assicurati che finisca con una domanda
113
  if not opening.endswith('?'):
114
  opening += " What do you do?"
@@ -121,8 +126,8 @@ def generate_random_opening():
121
 
122
  chat_history = []
123
 
124
- def generate_dm_response_with_timeout(message, timeout=45):
125
- """Genera risposta con timeout per evitare blocchi"""
126
  if llm is None:
127
  # Fallback responses se il modello non è disponibile
128
  import random
@@ -136,42 +141,43 @@ def generate_dm_response_with_timeout(message, timeout=45):
136
  return random.choice(fallbacks)
137
 
138
  try:
139
- # Prompt MOLTO compatto per velocità
140
- prompt = f"{system_prompt}\n\n"
141
 
142
- # Solo ultimo turno per velocità massima
143
- if chat_history:
144
- last_turn = chat_history[-1]
145
- prompt += f"Player: {last_turn['user']}\nDM: {last_turn['ai']}\n"
 
146
 
147
- prompt += f"Player: {message}\nDM:"
148
 
149
- # Parametri ULTRA ottimizzati per HF Free
150
  start_time = time.time()
151
  output = llm(
152
  prompt,
153
- max_tokens=60, # MOLTO ridotto per velocità
154
- stop=["Player:", "Human:", "\n\nPlayer:", "\n\nHuman:", "\n\n"],
155
- temperature=0.6, # Ridotto per velocità
156
- top_p=0.7,
157
- repeat_penalty=1.3, # Più alto per evitare loop
158
- top_k=20 # Limita scelte per velocità
 
159
  )
160
 
161
  # Verifica se ha impiegato troppo tempo
162
- if time.time() - start_time > timeout:
163
- print("Response took too long!")
 
164
  return "Time passes quickly. What do you do next?"
165
 
166
  text = output["choices"][0]["text"].strip()
167
 
168
- if text.startswith("DM:"):
169
- text = text[3:].strip()
170
-
171
  # Assicurati che ci sia sempre una domanda
172
  if not text.endswith(('?', '!', '.')):
173
  text += "?"
174
 
 
175
  return text
176
 
177
  except Exception as e:
@@ -184,13 +190,13 @@ def chat(message, history):
184
  if not message.strip():
185
  return "You stand there, unsure. What would you like to do?"
186
 
187
- # Genera risposta del DM con timeout
188
  dm_response = generate_dm_response_with_timeout(message)
189
 
190
- # Aggiorna cronologia (mantieni solo ultimi 2 turni per memoria)
191
  chat_history.append({"user": message, "ai": dm_response})
192
- if len(chat_history) > 2:
193
- chat_history = chat_history[-2:]
194
 
195
  return dm_response
196
 
@@ -199,22 +205,22 @@ def reset():
199
  chat_history = []
200
  return generate_random_opening()
201
 
202
- # Crea l'interfaccia OTTIMIZZATA
203
- with gr.Blocks(title="Infinite Dungeon", theme=gr.themes.Soft()) as demo:
204
- gr.Markdown("# 🐉 Infinite Dungeon")
205
- gr.Markdown("*Fast AI-powered D&D adventure optimized for Hugging Face Free*")
206
- gr.Markdown(" **Optimized for speed - responses in 10-30 seconds**")
207
 
208
- # Inizializza la chat (senza generare subito per evitare problemi di caricamento)
209
  chatbot = gr.Chatbot(
210
- value=[(None, "🌟 **Loading your adventure...** 🌟\n\nPress 'New Adventure' to begin!")],
211
  height=400,
212
  show_label=False
213
  )
214
 
215
  msg = gr.Textbox(
216
  label="Your action",
217
- placeholder="Keep it short: 'I attack', 'I search the room', 'I talk to the NPC'...",
218
  max_lines=2
219
  )
220
 
@@ -222,7 +228,7 @@ with gr.Blocks(title="Infinite Dungeon", theme=gr.themes.Soft()) as demo:
222
  submit = gr.Button("⚔️ Act", variant="primary", size="lg")
223
  reset_btn = gr.Button("🔄 New Adventure", variant="secondary")
224
 
225
- gr.Markdown("💡 **Tips**: Keep actions short and simple for faster responses!")
226
 
227
  # Funzione per gestire la chat
228
  def respond(message, chat_history_ui):
 
4
  import requests
5
  import time
6
 
7
+ # Percorso locale del modello - Qwen2.5-0.5B-Instruct VELOCE
8
+ MODEL_PATH = "qwen2.5-0.5b-instruct-q4_k_m.gguf"
9
+ MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"
10
 
11
  def download_model():
12
  """Scarica il modello se non esiste già"""
13
  if not os.path.exists(MODEL_PATH):
14
+ print("📥 Downloading Qwen2.5-0.5B-Instruct model...")
15
  try:
16
  response = requests.get(MODEL_URL, stream=True, timeout=300)
17
  response.raise_for_status()
 
29
  print(f"📥 Download progress: {progress:.1f}%")
30
 
31
  # Verifica che il file sia completo
32
+ if os.path.getsize(MODEL_PATH) < 100000: # Almeno 100KB
33
  print("❌ Downloaded file seems corrupted")
34
  os.remove(MODEL_PATH)
35
  return False
 
45
  else:
46
  print("✅ Model already exists!")
47
  # Verifica che il file esistente sia valido
48
+ if os.path.getsize(MODEL_PATH) < 100000:
49
  print("❌ Existing file seems corrupted, re-downloading...")
50
  os.remove(MODEL_PATH)
51
  return download_model() # Riprova
 
56
  llm = None # Inizializza a None
57
 
58
  if model_loaded:
59
+ # Inizializza il modello SUPER OTTIMIZZATO con Qwen2.5-0.5B
60
  try:
61
  llm = Llama(
62
  model_path=MODEL_PATH,
63
+ n_ctx=2048, # Aumentato grazie al modello più piccolo
64
+ n_threads=4, # Più thread possibili con modello piccolo
65
+ n_batch=256, # Batch size ottimizzato
66
  use_mlock=False, # Disabilitato per HF Free
67
  verbose=False,
68
  n_gpu_layers=0,
69
  use_mmap=True, # Usa memory mapping per efficienza
70
+ low_vram=True, # Modalità low memory
71
+ rope_scaling_type=1, # Ottimizzazione RoPE
72
+ rope_freq_base=10000.0
73
  )
74
+ print("✅ Qwen2.5-0.5B Model loaded successfully!")
75
  except Exception as e:
76
  print(f"❌ Error loading model: {e}")
77
  llm = None
78
  else:
79
  print("❌ Model not available, using fallback responses")
80
 
81
+ # System prompt OTTIMIZZATO per Qwen2.5
82
+ system_prompt = """<|im_start|>system
83
+ You are an expert D&D Dungeon Master. Create immersive, engaging adventures with vivid descriptions. Always end your responses with a question or choice for the player. Keep responses concise but atmospheric.
84
+ <|im_end|>"""
85
 
86
  def generate_random_opening():
87
  """Genera un inizio casuale per l'avventura usando l'AI"""
 
96
  return f"🌟 **New Adventure!** 🌟\n\n{random.choice(openings)}"
97
 
98
  try:
99
+ # Prompt ottimizzato per Qwen2.5
100
+ opening_prompt = f"""{system_prompt}
101
+ <|im_start|>user
102
+ Generate a creative D&D adventure opening in 2-3 sentences. Set an intriguing scene and end with a question for the player.
103
+ <|im_end|>
104
+ <|im_start|>assistant"""
105
 
106
  output = llm(
107
  opening_prompt,
108
+ max_tokens=80, # Leggermente più alto per qualità
109
  temperature=0.8,
110
  top_p=0.9,
111
  repeat_penalty=1.1,
112
+ stop=["<|im_end|>", "<|im_start|>", "User:", "Player:"]
113
  )
114
 
115
  opening = output["choices"][0]["text"].strip()
116
 
 
 
 
117
  # Assicurati che finisca con una domanda
118
  if not opening.endswith('?'):
119
  opening += " What do you do?"
 
126
 
127
  chat_history = []
128
 
129
+ def generate_dm_response_with_timeout(message, timeout=30):
130
+ """Genera risposta con timeout ridotto per velocità"""
131
  if llm is None:
132
  # Fallback responses se il modello non è disponibile
133
  import random
 
141
  return random.choice(fallbacks)
142
 
143
  try:
144
+ # Prompt ottimizzato per Qwen2.5 con chat template
145
+ prompt = f"{system_prompt}\n"
146
 
147
+ # Mantieni più contesto grazie al modello efficiente
148
+ context_turns = min(len(chat_history), 3) # Ultimi 3 turni
149
+ for turn in chat_history[-context_turns:]:
150
+ prompt += f"<|im_start|>user\n{turn['user']}\n<|im_end|>\n"
151
+ prompt += f"<|im_start|>assistant\n{turn['ai']}\n<|im_end|>\n"
152
 
153
+ prompt += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
154
 
155
+ # Parametri ottimizzati per Qwen2.5-0.5B
156
  start_time = time.time()
157
  output = llm(
158
  prompt,
159
+ max_tokens=100, # Aumentato per qualità migliore
160
+ stop=["<|im_end|>", "<|im_start|>", "User:", "Player:"],
161
+ temperature=0.7,
162
+ top_p=0.8,
163
+ repeat_penalty=1.2,
164
+ top_k=40,
165
+ min_p=0.1 # Miglior controllo qualità
166
  )
167
 
168
  # Verifica se ha impiegato troppo tempo
169
+ elapsed_time = time.time() - start_time
170
+ if elapsed_time > timeout:
171
+ print(f"Response took {elapsed_time:.1f}s (timeout: {timeout}s)")
172
  return "Time passes quickly. What do you do next?"
173
 
174
  text = output["choices"][0]["text"].strip()
175
 
 
 
 
176
  # Assicurati che ci sia sempre una domanda
177
  if not text.endswith(('?', '!', '.')):
178
  text += "?"
179
 
180
+ print(f"✅ Response generated in {elapsed_time:.1f}s")
181
  return text
182
 
183
  except Exception as e:
 
190
  if not message.strip():
191
  return "You stand there, unsure. What would you like to do?"
192
 
193
+ # Genera risposta del DM con timeout ridotto
194
  dm_response = generate_dm_response_with_timeout(message)
195
 
196
+ # Aggiorna cronologia (mantieni più turni grazie al modello efficiente)
197
  chat_history.append({"user": message, "ai": dm_response})
198
+ if len(chat_history) > 5: # Mantieni 5 turni invece di 2
199
+ chat_history = chat_history[-5:]
200
 
201
  return dm_response
202
 
 
205
  chat_history = []
206
  return generate_random_opening()
207
 
208
+ # Crea l'interfaccia SUPER OTTIMIZZATA
209
+ with gr.Blocks(title="Infinite Dungeon - Lightning Fast", theme=gr.themes.Soft()) as demo:
210
+ gr.Markdown("# Infinite Dungeon - Lightning Fast")
211
+ gr.Markdown("*Powered by Qwen2.5-0.5B - Optimized for 5-15 second responses*")
212
+ gr.Markdown("🚀 **Super fast AI D&D with perfect memory retention**")
213
 
214
+ # Inizializza la chat
215
  chatbot = gr.Chatbot(
216
+ value=[(None, " **Lightning Fast Adventure Ready!** ⚡\n\nPress 'New Adventure' to begin your quest!")],
217
  height=400,
218
  show_label=False
219
  )
220
 
221
  msg = gr.Textbox(
222
  label="Your action",
223
+ placeholder="What do you do? (e.g., 'I search the room', 'I attack the orc', 'I cast a spell')",
224
  max_lines=2
225
  )
226
 
 
228
  submit = gr.Button("⚔️ Act", variant="primary", size="lg")
229
  reset_btn = gr.Button("🔄 New Adventure", variant="secondary")
230
 
231
+ gr.Markdown(" **Ultra-fast responses**: 5-15 seconds | 🧠 **Perfect memory**: Never forgets your adventure!")
232
 
233
  # Funzione per gestire la chat
234
  def respond(message, chat_history_ui):