AlessandroCossard committed on
Commit
06b668f
·
verified ·
1 Parent(s): 92f4773

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -55
app.py CHANGED
@@ -4,14 +4,14 @@ import os
4
  import requests
5
  import time
6
 
7
- # Percorso locale del modello - Llama-2-7B-Chat più potente
8
- MODEL_PATH = "llama-2-7b-chat.Q4_K_M.gguf"
9
- MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
10
 
11
  def download_model():
12
  """Scarica il modello se non esiste già"""
13
  if not os.path.exists(MODEL_PATH):
14
- print("📥 Downloading Llama-2-7B-Chat model...")
15
  try:
16
  response = requests.get(MODEL_URL, stream=True, timeout=300)
17
  response.raise_for_status()
@@ -29,7 +29,7 @@ def download_model():
29
  print(f"📥 Download progress: {progress:.1f}%")
30
 
31
  # Verifica che il file sia completo
32
- if os.path.getsize(MODEL_PATH) < 1000000: # Almeno 1MB
33
  print("❌ Downloaded file seems corrupted")
34
  os.remove(MODEL_PATH)
35
  return False
@@ -45,7 +45,7 @@ def download_model():
45
  else:
46
  print("✅ Model already exists!")
47
  # Verifica che il file esistente sia valido
48
- if os.path.getsize(MODEL_PATH) < 1000000:
49
  print("❌ Existing file seems corrupted, re-downloading...")
50
  os.remove(MODEL_PATH)
51
  return download_model() # Riprova
@@ -56,28 +56,32 @@ model_loaded = download_model()
56
  llm = None # Inizializza a None
57
 
58
  if model_loaded:
59
- # Inizializza il modello OTTIMIZZATO per HF Free
60
  try:
61
  llm = Llama(
62
  model_path=MODEL_PATH,
63
- n_ctx=256, # MOLTO ridotto per HF Free
64
- n_threads=2, # Solo 2 CPU cores su HF Free
65
- n_batch=128, # Ridotto per memoria limitata
66
  use_mlock=False, # Disabilitato per HF Free
67
  verbose=False,
68
  n_gpu_layers=0,
69
  use_mmap=True, # Usa memory mapping per efficienza
70
- low_vram=True # Modalità low memory
 
 
71
  )
72
- print("✅ Model loaded successfully!")
73
  except Exception as e:
74
  print(f"❌ Error loading model: {e}")
75
  llm = None
76
  else:
77
  print("❌ Model not available, using fallback responses")
78
 
79
- # System prompt COMPATTO per velocità
80
- system_prompt = """You are a D&D Dungeon Master. Be concise but vivid. Always end with a question or choice for the player."""
 
 
81
 
82
  def generate_random_opening():
83
  """Genera un inizio casuale per l'avventura usando l'AI"""
@@ -92,23 +96,24 @@ def generate_random_opening():
92
  return f"🌟 **New Adventure!** 🌟\n\n{random.choice(openings)}"
93
 
94
  try:
95
- # Prompt per generare apertura
96
- opening_prompt = f"{system_prompt}\n\nGenerate a creative D&D adventure opening in 1-2 sentences. End with a question for the player.\nDM:"
 
 
 
 
97
 
98
  output = llm(
99
  opening_prompt,
100
- max_tokens=50,
101
  temperature=0.8,
102
  top_p=0.9,
103
  repeat_penalty=1.1,
104
- stop=["Player:", "Human:", "\n\n"]
105
  )
106
 
107
  opening = output["choices"][0]["text"].strip()
108
 
109
- if opening.startswith("DM:"):
110
- opening = opening[3:].strip()
111
-
112
  # Assicurati che finisca con una domanda
113
  if not opening.endswith('?'):
114
  opening += " What do you do?"
@@ -121,8 +126,8 @@ def generate_random_opening():
121
 
122
  chat_history = []
123
 
124
- def generate_dm_response_with_timeout(message, timeout=45):
125
- """Genera risposta con timeout per evitare blocchi"""
126
  if llm is None:
127
  # Fallback responses se il modello non è disponibile
128
  import random
@@ -136,42 +141,43 @@ def generate_dm_response_with_timeout(message, timeout=45):
136
  return random.choice(fallbacks)
137
 
138
  try:
139
- # Prompt MOLTO compatto per velocità
140
- prompt = f"{system_prompt}\n\n"
141
 
142
- # Solo ultimo turno per velocità massima
143
- if chat_history:
144
- last_turn = chat_history[-1]
145
- prompt += f"Player: {last_turn['user']}\nDM: {last_turn['ai']}\n"
 
146
 
147
- prompt += f"Player: {message}\nDM:"
148
 
149
- # Parametri ULTRA ottimizzati per HF Free
150
  start_time = time.time()
151
  output = llm(
152
  prompt,
153
- max_tokens=60, # MOLTO ridotto per velocità
154
- stop=["Player:", "Human:", "\n\nPlayer:", "\n\nHuman:", "\n\n"],
155
- temperature=0.6, # Ridotto per velocità
156
- top_p=0.7,
157
- repeat_penalty=1.3, # Più alto per evitare loop
158
- top_k=20 # Limita scelte per velocità
 
159
  )
160
 
161
  # Verifica se ha impiegato troppo tempo
162
- if time.time() - start_time > timeout:
163
- print("Response took too long!")
 
164
  return "Time passes quickly. What do you do next?"
165
 
166
  text = output["choices"][0]["text"].strip()
167
 
168
- if text.startswith("DM:"):
169
- text = text[3:].strip()
170
-
171
  # Assicurati che ci sia sempre una domanda
172
  if not text.endswith(('?', '!', '.')):
173
  text += "?"
174
 
 
175
  return text
176
 
177
  except Exception as e:
@@ -184,13 +190,13 @@ def chat(message, history):
184
  if not message.strip():
185
  return "You stand there, unsure. What would you like to do?"
186
 
187
- # Genera risposta del DM con timeout
188
  dm_response = generate_dm_response_with_timeout(message)
189
 
190
- # Aggiorna cronologia (mantieni solo ultimi 2 turni per memoria)
191
  chat_history.append({"user": message, "ai": dm_response})
192
- if len(chat_history) > 2:
193
- chat_history = chat_history[-2:]
194
 
195
  return dm_response
196
 
@@ -199,22 +205,22 @@ def reset():
199
  chat_history = []
200
  return generate_random_opening()
201
 
202
- # Crea l'interfaccia OTTIMIZZATA
203
- with gr.Blocks(title="Infinite Dungeon", theme=gr.themes.Soft()) as demo:
204
- gr.Markdown("# 🐉 Infinite Dungeon")
205
- gr.Markdown("*Fast AI-powered D&D adventure optimized for Hugging Face Free*")
206
- gr.Markdown(" **Optimized for speed - responses in 10-30 seconds**")
207
 
208
- # Inizializza la chat (senza generare subito per evitare problemi di caricamento)
209
  chatbot = gr.Chatbot(
210
- value=[(None, "🌟 **Loading your adventure...** 🌟\n\nPress 'New Adventure' to begin!")],
211
  height=400,
212
  show_label=False
213
  )
214
 
215
  msg = gr.Textbox(
216
  label="Your action",
217
- placeholder="Keep it short: 'I attack', 'I search the room', 'I talk to the NPC'...",
218
  max_lines=2
219
  )
220
 
@@ -222,7 +228,7 @@ with gr.Blocks(title="Infinite Dungeon", theme=gr.themes.Soft()) as demo:
222
  submit = gr.Button("⚔️ Act", variant="primary", size="lg")
223
  reset_btn = gr.Button("🔄 New Adventure", variant="secondary")
224
 
225
- gr.Markdown("💡 **Tips**: Keep actions short and simple for faster responses!")
226
 
227
  # Funzione per gestire la chat
228
  def respond(message, chat_history_ui):
 
4
  import requests
5
  import time
6
 
7
+ # Percorso locale del modello - Qwen2.5-0.5B-Instruct VELOCE
8
+ MODEL_PATH = "qwen2.5-0.5b-instruct-q4_k_m.gguf"
9
+ MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"
10
 
11
  def download_model():
12
  """Scarica il modello se non esiste già"""
13
  if not os.path.exists(MODEL_PATH):
14
+ print("📥 Downloading Qwen2.5-0.5B-Instruct model...")
15
  try:
16
  response = requests.get(MODEL_URL, stream=True, timeout=300)
17
  response.raise_for_status()
 
29
  print(f"📥 Download progress: {progress:.1f}%")
30
 
31
  # Verifica che il file sia completo
32
+ if os.path.getsize(MODEL_PATH) < 100000: # Almeno 100KB
33
  print("❌ Downloaded file seems corrupted")
34
  os.remove(MODEL_PATH)
35
  return False
 
45
  else:
46
  print("✅ Model already exists!")
47
  # Verifica che il file esistente sia valido
48
+ if os.path.getsize(MODEL_PATH) < 100000:
49
  print("❌ Existing file seems corrupted, re-downloading...")
50
  os.remove(MODEL_PATH)
51
  return download_model() # Riprova
 
56
  llm = None # Inizializza a None
57
 
58
  if model_loaded:
59
+ # Inizializza il modello SUPER OTTIMIZZATO con Qwen2.5-0.5B
60
  try:
61
  llm = Llama(
62
  model_path=MODEL_PATH,
63
+ n_ctx=2048, # Aumentato grazie al modello più piccolo
64
+ n_threads=4, # Più thread possibili con modello piccolo
65
+ n_batch=256, # Batch size ottimizzato
66
  use_mlock=False, # Disabilitato per HF Free
67
  verbose=False,
68
  n_gpu_layers=0,
69
  use_mmap=True, # Usa memory mapping per efficienza
70
+ low_vram=True, # Modalità low memory
71
+ rope_scaling_type=1, # Ottimizzazione RoPE
72
+ rope_freq_base=10000.0
73
  )
74
+ print("✅ Qwen2.5-0.5B Model loaded successfully!")
75
  except Exception as e:
76
  print(f"❌ Error loading model: {e}")
77
  llm = None
78
  else:
79
  print("❌ Model not available, using fallback responses")
80
 
81
+ # System prompt OTTIMIZZATO per Qwen2.5
82
+ system_prompt = """<|im_start|>system
83
+ You are an expert D&D Dungeon Master. Create immersive, engaging adventures with vivid descriptions. Always end your responses with a question or choice for the player. Keep responses concise but atmospheric.
84
+ <|im_end|>"""
85
 
86
  def generate_random_opening():
87
  """Genera un inizio casuale per l'avventura usando l'AI"""
 
96
  return f"🌟 **New Adventure!** 🌟\n\n{random.choice(openings)}"
97
 
98
  try:
99
+ # Prompt ottimizzato per Qwen2.5
100
+ opening_prompt = f"""{system_prompt}
101
+ <|im_start|>user
102
+ Generate a creative D&D adventure opening in 2-3 sentences. Set an intriguing scene and end with a question for the player.
103
+ <|im_end|>
104
+ <|im_start|>assistant"""
105
 
106
  output = llm(
107
  opening_prompt,
108
+ max_tokens=80, # Leggermente più alto per qualità
109
  temperature=0.8,
110
  top_p=0.9,
111
  repeat_penalty=1.1,
112
+ stop=["<|im_end|>", "<|im_start|>", "User:", "Player:"]
113
  )
114
 
115
  opening = output["choices"][0]["text"].strip()
116
 
 
 
 
117
  # Assicurati che finisca con una domanda
118
  if not opening.endswith('?'):
119
  opening += " What do you do?"
 
126
 
127
  chat_history = []
128
 
129
+ def generate_dm_response_with_timeout(message, timeout=30):
130
+ """Genera risposta con timeout ridotto per velocità"""
131
  if llm is None:
132
  # Fallback responses se il modello non è disponibile
133
  import random
 
141
  return random.choice(fallbacks)
142
 
143
  try:
144
+ # Prompt ottimizzato per Qwen2.5 con chat template
145
+ prompt = f"{system_prompt}\n"
146
 
147
+ # Mantieni più contesto grazie al modello efficiente
148
+ context_turns = min(len(chat_history), 3) # Ultimi 3 turni
149
+ for turn in chat_history[-context_turns:]:
150
+ prompt += f"<|im_start|>user\n{turn['user']}\n<|im_end|>\n"
151
+ prompt += f"<|im_start|>assistant\n{turn['ai']}\n<|im_end|>\n"
152
 
153
+ prompt += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
154
 
155
+ # Parametri ottimizzati per Qwen2.5-0.5B
156
  start_time = time.time()
157
  output = llm(
158
  prompt,
159
+ max_tokens=100, # Aumentato per qualità migliore
160
+ stop=["<|im_end|>", "<|im_start|>", "User:", "Player:"],
161
+ temperature=0.7,
162
+ top_p=0.8,
163
+ repeat_penalty=1.2,
164
+ top_k=40,
165
+ min_p=0.1 # Miglior controllo qualità
166
  )
167
 
168
  # Verifica se ha impiegato troppo tempo
169
+ elapsed_time = time.time() - start_time
170
+ if elapsed_time > timeout:
171
+ print(f"Response took {elapsed_time:.1f}s (timeout: {timeout}s)")
172
  return "Time passes quickly. What do you do next?"
173
 
174
  text = output["choices"][0]["text"].strip()
175
 
 
 
 
176
  # Assicurati che ci sia sempre una domanda
177
  if not text.endswith(('?', '!', '.')):
178
  text += "?"
179
 
180
+ print(f"✅ Response generated in {elapsed_time:.1f}s")
181
  return text
182
 
183
  except Exception as e:
 
190
  if not message.strip():
191
  return "You stand there, unsure. What would you like to do?"
192
 
193
+ # Genera risposta del DM con timeout ridotto
194
  dm_response = generate_dm_response_with_timeout(message)
195
 
196
+ # Aggiorna cronologia (mantieni più turni grazie al modello efficiente)
197
  chat_history.append({"user": message, "ai": dm_response})
198
+ if len(chat_history) > 5: # Mantieni 5 turni invece di 2
199
+ chat_history = chat_history[-5:]
200
 
201
  return dm_response
202
 
 
205
  chat_history = []
206
  return generate_random_opening()
207
 
208
+ # Crea l'interfaccia SUPER OTTIMIZZATA
209
+ with gr.Blocks(title="Infinite Dungeon - Lightning Fast", theme=gr.themes.Soft()) as demo:
210
+ gr.Markdown("# Infinite Dungeon - Lightning Fast")
211
+ gr.Markdown("*Powered by Qwen2.5-0.5B - Optimized for 5-15 second responses*")
212
+ gr.Markdown("🚀 **Super fast AI D&D with perfect memory retention**")
213
 
214
+ # Inizializza la chat
215
  chatbot = gr.Chatbot(
216
+ value=[(None, " **Lightning Fast Adventure Ready!** ⚡\n\nPress 'New Adventure' to begin your quest!")],
217
  height=400,
218
  show_label=False
219
  )
220
 
221
  msg = gr.Textbox(
222
  label="Your action",
223
+ placeholder="What do you do? (e.g., 'I search the room', 'I attack the orc', 'I cast a spell')",
224
  max_lines=2
225
  )
226
 
 
228
  submit = gr.Button("⚔️ Act", variant="primary", size="lg")
229
  reset_btn = gr.Button("🔄 New Adventure", variant="secondary")
230
 
231
+ gr.Markdown(" **Ultra-fast responses**: 5-15 seconds | 🧠 **Perfect memory**: Never forgets your adventure!")
232
 
233
  # Funzione per gestire la chat
234
  def respond(message, chat_history_ui):