TobDeBer committed
Commit c3fb36e · verified
1 Parent(s): edb6fb6
Files changed (1)
  1. app.py +89 -161
app.py CHANGED
@@ -1,113 +1,61 @@
  from collections.abc import Iterator
  from datetime import datetime
  from pathlib import Path
- from threading import Thread
- from huggingface_hub import hf_hub_download, login
- from themes.research_monochrome import ResearchMonochrome
  from typing import Iterator, List, Dict
-
+ from huggingface_hub import hf_hub_download
+ from themes.research_monochrome import ResearchMonochrome
  import spaces
- import os
- import requests
- import json
- import subprocess
  import gradio as gr
- import atexit
- import time
+ from llama_cpp import Llama  # <-- new: import the Llama class
+ import os
+
+ # --- Configuration ---
 
  today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002
+ SYS_PROMPT = f"""Today's Date: {today_date}. You are Granite, developed by IBM. You are a helpful AI assistant"""
+ TITLE = "IBM Granite 4 Tiny Preview served via llama-cpp-python"
+ DESCRIPTION = """<p>Granite 4 Tiny is an open-source LLM supporting a 128k context window. This demo uses only 2K context.<span class="gr_docs_link"><a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a></span></p>"""
 
- SYS_PROMPT = f"""Today's Date: {today_date}.
- You are Granite, developed by IBM. You are a helpful AI assistant"""
- TITLE = "IBM Granite 4 Tiny Preview served from local GGUF server"
- DESCRIPTION = """
- <p>Granite 4 Tiny is an open-source LLM supporting a 128k context window. This demo uses only 2K context.
- <span class="gr_docs_link">
- <a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a>
- </span>
- </p>
- """
- LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
  MAX_NEW_TOKENS = 1024
  TEMPERATURE = 0.7
  TOP_P = 0.85
  TOP_K = 50
  REPETITION_PENALTY = 1.05
+ CONTEXT_WINDOW = 2048  # set the context window size
 
- # Global variable to store the server process
- llama_process = None
-
- # Ensure the server process is killed when the application exits
- def cleanup_server():
-     global llama_process
-     if llama_process and llama_process.poll() is None:
-         print("Stopping llama-server process...")
-         llama_process.terminate()
-         llama_process.wait(timeout=5)
- atexit.register(cleanup_server)
-
- # determine platform: CUDA or CPU
- try:
-     subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
-     platform = "CUDA"
- except subprocess.CalledProcessError:
-     platform = "CPU"
- except FileNotFoundError:
-     platform = "CPU"
-
- platform = "CUDA"  # override for ZERO, because the GPU is not available when the download decision is made
-
- print(f"Detected platform {platform}")
+ # --- Model setup ---
 
+ # Download the model
  gguf_name = "granite-4.0-tiny-preview-Q4_K_M.gguf"
- gguf_path = hf_hub_download(
-     repo_id="ibm-granite/granite-4.0-tiny-preview-GGUF",
-     filename=gguf_name,
-     local_dir="."
- )
-
- # set exe_name depending on platform
- exe_name = "llama-server-6343-cuda" if platform == "CUDA" else "llama-server-6343-blas"
- exe_path = hf_hub_download(
-     repo_id="TobDeBer/Skipper",
-     filename=exe_name,
-     local_dir="."
+ # The path where the model will be stored
+ model_path = hf_hub_download(
+     repo_id="ibm-granite/granite-4.0-tiny-preview-GGUF",
+     filename=gguf_name,
+     local_dir="."
  )
- subprocess.run(["chmod", "+x", exe_name])
-
- # --- New Decorated Function to Launch Server on GPU ---
- @spaces.GPU(duration=30)
- def start_llama_server():
-     global llama_process
-
-     if llama_process and llama_process.poll() is None:
-         print("Server is already running.")
-         return
-
-     server_env = os.environ.copy()
-
-     # 1. Define the command (now explicitly using the CUDA binary)
-     command = [
-         "./" + exe_name,
-         "-m", gguf_name,
-         "--temp", "0.0",
-         "-c", "2048",
-         "-t", "8",
-         "--port", "8081",
-         "--no-warmup",
-         "-ngl", "999"  # <--- CRUCIAL: GPU offload instruction
-     ]
-
-     # 2. Launch the server now that the GPU is guaranteed to be available
-     llama_process = subprocess.Popen(command, env=server_env)
-     print(f"Llama-server process started with PID {llama_process.pid}")
+ print(f"Model downloaded to: {model_path}")
 
-     # You might need a small sleep here to wait for the server to initialize
-     time.sleep(5)
+ # Load the Llama model
+ # Note: n_gpu_layers (the number of layers offloaded to the GPU) should be set
+ # to a high value such as 999 to force full GPU offload.
+ # 'n_ctx' sets the context size.
+ # 'chat_format' is needed to format the conversation correctly.
+ try:
+     llama_model = Llama(
+         model_path=model_path,
+         n_ctx=CONTEXT_WINDOW,
+         n_gpu_layers=999,  # offload all layers to the GPU
+         chat_format="chatml",  # Granite 4 Tiny uses a format similar to the ChatML standard
+         verbose=False
+     )
+     print("Llama model initialized successfully.")
+ except Exception as e:
+     print(f"Error initializing Llama model: {e}")
+     llama_model = None  # set to None if initialization fails
 
+ # --- Gradio functions ---
 
  custom_theme = ResearchMonochrome()
- print("Theme type:", type(custom_theme))
 
  @spaces.GPU(duration=30)
  def generate(
@@ -119,90 +67,70 @@ def generate(
      top_k: float = TOP_K,
      max_new_tokens: int = MAX_NEW_TOKENS,
  ) -> Iterator[str]:
-     """Generate function for chat demo using Llama.cpp server."""
-
-     # Ensure the server is running before attempting a generation request
-     # You'll need a more robust check in a production environment
-     if llama_process is None or llama_process.poll() is not None:
-         start_llama_server()  # Restart if needed (or handle the error)
-
-     # Build messages
-     conversation = []
-     conversation.append({"role": "system", "content": SYS_PROMPT})
-     conversation += chat_history
-     conversation.append({"role": "user", "content": message})
-
-     # Prepare the prompt for the Llama.cpp server
-     prompt = ""
-     for item in conversation:
-         if item["role"] == "system":
-             prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
-         elif item["role"] == "user":
-             prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
-         elif item["role"] == "assistant":
-             prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
-     prompt += "<|model|>\n"  # Add the beginning token for the assistant
+     """Generate function for the chat demo using llama-cpp-python."""
 
+     if llama_model is None:
+         yield "Error: The model failed to initialize."
+         return
 
-     # Construct the request payload
-     payload = {
-         "prompt": prompt,
-         "stream": True,  # Enable streaming
-         "max_tokens": max_new_tokens,
-         "temperature": temperature,
-         "repeat_penalty": repetition_penalty,
-         "top_p": top_p,
-         "top_k": top_k,
-         "stop": ["<|file_separator|>"],  # stops after it sees this
-     }
-
+     # 1. Build the messages for llama-cpp-python
+     # llama-cpp-python expects the OpenAI chat format
+     messages = []
+     messages.append({"role": "system", "content": SYS_PROMPT})
+
+     # Add the chat history
+     for item in chat_history:
+         # Gradio can store history as a list of lists: [["user_msg", "assistant_msg"], ...],
+         # but here `chat_history` arrives as a list of dictionaries [..., {"role": "user", "content": "..."}],
+         # as is typical for the Gradio ChatInterface
+         if item["role"] == "user":
+             messages.append({"role": "user", "content": item["content"]})
+         elif item["role"] == "assistant":
+             messages.append({"role": "assistant", "content": item["content"]})
+
+     # Add the current user message
+     messages.append({"role": "user", "content": message})
+
+     # 2. Start generation
+     full_response = ""
      try:
-         # Make the request to the Llama.cpp server
-         with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
-             response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
-
-             # Stream the response from the server
-             outputs = []
-             for line in response.iter_lines():
-                 if line:
-                     # Decode the line
-                     decoded_line = line.decode('utf-8')
-                     # Remove 'data: ' prefix if present
-                     if decoded_line.startswith("data: "):
-                         decoded_line = decoded_line[6:]
-
-                     # Handle potential JSON decoding errors
-                     try:
-                         json_data = json.loads(decoded_line)
-                         text = json_data.get("content", "")  # Extract the content field; crucial.
-                         if text:
-                             outputs.append(text)
-                             yield "".join(outputs)
-
-                     except json.JSONDecodeError:
-                         print(f"JSONDecodeError: {decoded_line}")
-                         # Handle the error, potentially skipping the line or logging it.
-
-     except requests.exceptions.RequestException as e:
-         print(f"Request failed: {e}")
-         yield f"Error: {e}"  # Yield an error message to the user
+         # Use llama-cpp-python's OpenAI-compatible chat API; with stream=True it yields dict chunks
+         stream = llama_model.create_chat_completion(
+             messages=messages,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             max_tokens=max_new_tokens,
+             repeat_penalty=repetition_penalty,
+             stop=["<|file_separator|>"],  # stop token as in the original code
+             stream=True
+         )
+
+         # 3. Stream the response
+         for chunk in stream:
+             if chunk and "choices" in chunk and len(chunk["choices"]) > 0:
+                 delta = chunk["choices"][0]["delta"]
+                 if "content" in delta:
+                     text = delta["content"]
+                     full_response += text
+                     yield full_response
+
      except Exception as e:
-         print(f"An unexpected error occurred: {e}")
-         yield f"Error: {e}"  # Yield error message
+         print(f"An error occurred during generation: {e}")
+         yield f"Error: {e}"
 
 
+ # --- Gradio UI setup (unchanged) ---
+
  css_file_path = Path(Path(__file__).parent / "app.css")
 
  # advanced settings (displayed in Accordion)
  temperature_slider = gr.Slider(
-     minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
- )
+     minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"])
  top_p_slider = gr.Slider(
-     minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
- )
+     minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"])
  top_k_slider = gr.Slider(
-     minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
- )
+     minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"])
  repetition_penalty_slider = gr.Slider(
      minimum=0,
      maximum=2.0,
@@ -253,4 +181,4 @@ with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=custom_theme, ti
      )
 
  if __name__ == "__main__":
-     demo.queue().launch()
+     demo.queue().launch()
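
For a quick local check of the new llama-cpp-python path outside the Space, the following minimal sketch streams one chat completion the same way the updated generate() does. It is not part of the commit; it assumes llama-cpp-python and huggingface_hub are installed, runs CPU-only (n_gpu_layers=0) rather than with the Space's n_gpu_layers=999, and relies on create_chat_completion yielding OpenAI-style dict chunks when stream=True.

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the same GGUF the Space uses
model_path = hf_hub_download(
    repo_id="ibm-granite/granite-4.0-tiny-preview-GGUF",
    filename="granite-4.0-tiny-preview-Q4_K_M.gguf",
    local_dir=".",
)

# CPU-only load for a local smoke test
llm = Llama(model_path=model_path, n_ctx=2048, n_gpu_layers=0, chat_format="chatml", verbose=False)

# Stream a single completion; each chunk is a dict shaped like the OpenAI schema:
# {"choices": [{"delta": {"content": "..."}}]}
for chunk in llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are Granite, developed by IBM. You are a helpful AI assistant"},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    max_tokens=64,
    stream=True,
):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)
print()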