Granite4Tiny

Running on Zero

App Files Files Community

TobDeBer committed 20 days ago

Commit

c3fb36e

verified ·

1 Parent(s): edb6fb6

refactor

Browse files

Files changed (1) hide show

app.py +89 -161

app.py CHANGED Viewed

@@ -1,113 +1,61 @@
 from collections.abc import Iterator
 from datetime import datetime
 from pathlib import Path
-from threading import Thread
-from huggingface_hub import hf_hub_download, login
-from themes.research_monochrome import ResearchMonochrome
 from typing import Iterator, List, Dict
 import spaces
-import os
-import requests
-import json
-import subprocess
 import gradio as gr
-import atexit
-import time
 today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002
-SYS_PROMPT = f"""Today's Date: {today_date}.
-You are Granite, developed by IBM. You are a helpful AI assistant"""
-TITLE = "IBM Granite 4 Tiny Preview served from local GGUF server"
-DESCRIPTION = """
-<p>Granite 4 Tiny is an open-source LLM supporting a 128k context window. This demo uses only 2K context.
-<span class="gr_docs_link">
-<a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a>
-</span>
-</p>
-"""
-LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
 MAX_NEW_TOKENS = 1024
 TEMPERATURE = 0.7
 TOP_P = 0.85
 TOP_K = 50
 REPETITION_PENALTY = 1.05
-# Global variable to store the server process
-llama_process = None
-# Ensure the server process is killed when the application exits
-def cleanup_server():
-    global llama_process
-    if llama_process and llama_process.poll() is None:
-        print("Stopping llama-server process...")
-        llama_process.terminate()
-        llama_process.wait(timeout=5)
-atexit.register(cleanup_server)
-# determine platform: CUDA or CPU
-try:
-    subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
-    platform = "CUDA"
-except subprocess.CalledProcessError:
-    platform = "CPU"
-except FileNotFoundError:
-    platform = "CPU"
-platform = "CUDA" # override for ZERO, because the GPU is not available at the time download decision is done
-print(f"Detected platform {platform}")
 gguf_name = "granite-4.0-tiny-preview-Q4_K_M.gguf"
-gguf_path = hf_hub_download(
-            repo_id="ibm-granite/granite-4.0-tiny-preview-GGUF",
-            filename=gguf_name,
-            local_dir="."
-)
-# set exe_name depending on platform
-exe_name = "llama-server-6343-cuda" if platform == "CUDA" else "llama-server-6343-blas"
-exe_path = hf_hub_download(
-            repo_id="TobDeBer/Skipper",
-            filename=exe_name,
-            local_dir="."
 )
-subprocess.run(["chmod", "+x", exe_name])
-# --- New Decorated Function to Launch Server on GPU ---
-@spaces.GPU(duration=30)
-def start_llama_server():
-    global llama_process
-    if llama_process and llama_process.poll() is None:
-        print("Server is already running.")
-        return
-    server_env = os.environ.copy()
-    # 1. Define the command (now explicitly using the CUDA binary)
-    command = [
-        "./" + exe_name,
-        "-m", gguf_name,
-        "--temp", "0.0",
-        "-c", "2048",
-        "-t", "8",
-        "--port", "8081",
-        "--no-warmup",
-        "-ngl", "999"  # <--- CRUCIAL: GPU offload instruction
-    ]
-    # 2. Launch the server now that the GPU is guaranteed to be available
-    llama_process = subprocess.Popen(command, env=server_env)
-    print(f"Llama-server process started with PID {llama_process.pid}")
-    # You might need a small sleep here to wait for the server to initialize
-    time.sleep(5)
 custom_theme = ResearchMonochrome()
-print("Theme type:", type(custom_theme))
 @spaces.GPU(duration=30)
 def generate(
@@ -119,90 +67,70 @@ def generate(
     top_k: float = TOP_K,
     max_new_tokens: int = MAX_NEW_TOKENS,
 ) -> Iterator[str]:
-    """Generate function for chat demo using Llama.cpp server."""
-    # Ensure the server is running before attempting a generation request
-    # You'll need a more robust check in a production environment
-    if llama_process is None or llama_process.poll() is not None:
-        start_llama_server() # Restart if needed (or handle the error)
-    # Build messages
-    conversation = []
-    conversation.append({"role": "system", "content": SYS_PROMPT})
-    conversation += chat_history
-    conversation.append({"role": "user", "content": message})
-    # Prepare the prompt for the Llama.cpp server
-    prompt = ""
-    for item in conversation:
-      if item["role"] == "system":
-        prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
-      elif item["role"] == "user":
-        prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
-      elif item["role"] == "assistant":
-        prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
-    prompt += "<|model|>\n"  # Add the beginning token for the assistant
-    # Construct the request payload
-    payload = {
-        "prompt": prompt,
-        "stream": True,  # Enable streaming
-        "max_tokens": max_new_tokens,
-        "temperature": temperature,
-        "repeat_penalty": repetition_penalty,
-        "top_p": top_p,
-        "top_k": top_k,
-        "stop": ["<|file_separator|>"], #stops after it sees this
-    }
     try:
-        # Make the request to the Llama.cpp server
-        with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
-            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
-            # Stream the response from the server
-            outputs = []
-            for line in response.iter_lines():
-                if line:
-                    # Decode the line
-                    decoded_line = line.decode('utf-8')
-                    # Remove 'data: ' prefix if present
-                    if decoded_line.startswith("data: "):
-                        decoded_line = decoded_line[6:]
-                    # Handle potential JSON decoding errors
-                    try:
-                        json_data = json.loads(decoded_line)
-                        text = json_data.get("content", "")  # Extract content field. crucial.
-                        if text:
-                            outputs.append(text)
-                            yield "".join(outputs)
-                    except json.JSONDecodeError:
-                        print(f"JSONDecodeError: {decoded_line}")
-                        # Handle the error, potentially skipping the line or logging it.
-    except requests.exceptions.RequestException as e:
-        print(f"Request failed: {e}")
-        yield f"Error: {e}"  # Yield an error message to the user
     except Exception as e:
-        print(f"An unexpected error occurred: {e}")
-        yield f"Error: {e}" # Yield error message
 css_file_path = Path(Path(__file__).parent / "app.css")
 # advanced settings (displayed in Accordion)
 temperature_slider = gr.Slider(
-    minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
-)
 top_p_slider = gr.Slider(
-    minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
-)
 top_k_slider = gr.Slider(
-    minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
-)
 repetition_penalty_slider = gr.Slider(
     minimum=0,
     maximum=2.0,
@@ -253,4 +181,4 @@ with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=custom_theme, ti
     )
 if __name__ == "__main__":
-    demo.queue().launch()

 from collections.abc import Iterator
 from datetime import datetime
 from pathlib import Path
 from typing import Iterator, List, Dict
+from huggingface_hub import hf_hub_download
+from themes.research_monochrome import ResearchMonochrome
 import spaces
 import gradio as gr
+from llama_cpp import Llama # <-- Neu: Llama-Klasse importieren
+import os
+# --- Konfiguration ---
 today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002
+SYS_PROMPT = f"""Today's Date: {today_date}.You are Granite, developed by IBM. You are a helpful AI assistant"""
+TITLE = "IBM Granite 4 Tiny Preview served via llama-cpp-python"
+DESCRIPTION = """<p>Granite 4 Tiny is an open-source LLM supporting a 128k context window. This demo uses only 2K context.<span class="gr_docs_link"><a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a></span></p>"""
 MAX_NEW_TOKENS = 1024
 TEMPERATURE = 0.7
 TOP_P = 0.85
 TOP_K = 50
 REPETITION_PENALTY = 1.05
+CONTEXT_WINDOW = 2048 # Kontextfenstergröße setzen
+# --- Modell-Setup ---
+# Modell herunterladen
 gguf_name = "granite-4.0-tiny-preview-Q4_K_M.gguf"
+# Der Pfad, in dem das Modell gespeichert wird
+model_path = hf_hub_download(
+    repo_id="ibm-granite/granite-4.0-tiny-preview-GGUF",
+    filename=gguf_name,
+    local_dir="."
 )
+print(f"Model downloaded to: {model_path}")
+# Llama-Modell laden
+# Hinweis: Die Anzahl der Schichten, die auf die GPU entladen werden (n_gpu_layers),
+# sollte auf einen hohen Wert wie 999 gesetzt werden, um die gesamte GPU-Auslagerung zu erzwingen.
+# 'n_ctx' setzt die Kontextgröße.
+# 'chat_format' wird für die korrekte Formatierung der Konversation benötigt.
+try:
+    llama_model = Llama(
+        model_path=model_path,
+        n_ctx=CONTEXT_WINDOW,
+        n_gpu_layers=999, # Entlädt alle Schichten auf die GPU
+        chat_format="chatml", # Granite 4 Tiny verwendet ein Format, das dem ChatML-Standard ähnelt
+        verbose=False
+    )
+    print("Llama model initialized successfully.")
+except Exception as e:
+    print(f"Error initializing Llama model: {e}")
+    llama_model = None # Setze auf None, falls ein Fehler auftritt
+# --- Gradio-Funktionen ---
 custom_theme = ResearchMonochrome()
 @spaces.GPU(duration=30)
 def generate(
     top_k: float = TOP_K,
     max_new_tokens: int = MAX_NEW_TOKENS,
 ) -> Iterator[str]:
+    """Generierungsfunktion für Chat-Demo unter Verwendung von llama-cpp-python."""
+    if llama_model is None:
+        yield "Error: The model failed to initialize."
+        return
+    # 1. Nachrichten für llama-cpp-python aufbereiten
+    # llama-cpp-python erwartet ein OpenAI-Chat-Format
+    messages = []
+    messages.append({"role": "system", "content": SYS_PROMPT})
+    # Füge den Chatverlauf hinzu
+    for item in chat_history:
+        # The loop below reads item["role"] and item["content"], so `chat_history` must be a list of
+        # dictionaries such as {"role": "user", "content": "..."} (presumably Gradio's "messages" history
+        # format), not the older list-of-pairs format [["user_msg", "assistant_msg"], ...].
+        if item["role"] == "user":
+            messages.append({"role": "user", "content": item["content"]})
+        elif item["role"] == "assistant":
+            messages.append({"role": "assistant", "content": item["content"]})
+    # Füge die aktuelle Benutzernachricht hinzu
+    messages.append({"role": "user", "content": message})
+    # 2. Generierung starten
+    full_response = ""
     try:
+        # Verwende die OpenAI-kompatible Streaming-API von llama-cpp-python
+        stream = llama_model.create_chat_completion_openai_v1(
+            messages=messages,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            max_tokens=max_new_tokens,
+            repeat_penalty=repetition_penalty,
+            stop=["<|file_separator|>"], # Stopp-Token wie im Original-Code
+            stream=True
+        )
+        # 3. Streamen der Antwort
+        for chunk in stream:
+            if chunk and "choices" in chunk and len(chunk["choices"]) > 0:
+                delta = chunk["choices"][0]["delta"]
+                if "content" in delta:
+                    text = delta["content"]
+                    full_response += text
+                    yield full_response
     except Exception as e:
+        print(f"An error occurred during generation: {e}")
+        yield f"Error: {e}"
+# --- Gradio UI-Setup (Unverändert) ---
 css_file_path = Path(Path(__file__).parent / "app.css")
 # advanced settings (displayed in Accordion)
 temperature_slider = gr.Slider(
+    minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"])
 top_p_slider = gr.Slider(
+    minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"])
 top_k_slider = gr.Slider(
+    minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"])
 repetition_penalty_slider = gr.Slider(
     minimum=0,
     maximum=2.0,
     )
 if __name__ == "__main__":
+    demo.queue().launch()