TobDeBer committed
Commit fed41be · verified · 1 Parent(s): a972051

zero adapt

Files changed (1): app.py (+43 -10)
app.py CHANGED
@@ -12,6 +12,7 @@ import requests
 import json
 import subprocess
 import gradio as gr
+import atexit
 
 today_date = datetime.today().strftime("%B %-d, %Y") # noqa: DTZ002
 
@@ -32,6 +33,18 @@ TOP_P = 0.85
 TOP_K = 50
 REPETITION_PENALTY = 1.05
 
+# Global variable to store the server process
+llama_process = None
+
+# Ensure the server process is killed when the application exits
+def cleanup_server():
+    global llama_process
+    if llama_process and llama_process.poll() is None:
+        print("Stopping llama-server process...")
+        llama_process.terminate()
+        llama_process.wait(timeout=5)
+atexit.register(cleanup_server)
+
 # determine platform: CUDA or CPU
 try:
     subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
@@ -59,17 +72,37 @@ exe_path = hf_hub_download(
     filename=exe_name,
     local_dir="."
 )
-server_env = os.environ.copy()
-
-# start llama-server
 subprocess.run(["chmod", "+x", exe_name])
-command = ["./" + exe_name, "-m", gguf_name, "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"]
-if platform == "CUDA":
-    # -ngl 999 tells the server to attempt to offload 999 layers (essentially all of them)
-    # to the GPU for execution. Without this, it runs on the CPU, even with the CUDA binary.
-    command.extend(["-ngl", "999"])
-process = subprocess.Popen(command, env=server_env)
-print(f"Llama-server process started with PID {process.pid}")
+
+# --- New Decorated Function to Launch Server on GPU ---
+@spaces.GPU(duration=30)
+def start_llama_server():
+    global llama_process
+
+    if llama_process and llama_process.poll() is None:
+        print("Server is already running.")
+        return
+
+    server_env = os.environ.copy()
+
+    # 1. Define the command (now explicitly using the CUDA binary)
+    command = [
+        "./" + exe_name,
+        "-m", gguf_name,
+        "--temp", "0.0",
+        "-c", "2048",
+        "-t", "8",
+        "--port", "8081",
+        "-ngl", "999"  # <--- CRUCIAL: GPU offload instruction
+    ]
+
+    # 2. Launch the server now that the GPU is guaranteed to be available
+    llama_process = subprocess.Popen(command, env=server_env)
+    print(f"Llama-server process started with PID {llama_process.pid}")
+
+    # You might need a small sleep here to wait for the server to initialize
+    # time.sleep(5)
+
 
 custom_theme = ResearchMonochrome()
 print("Theme type:", type(custom_theme))
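Note on the "# time.sleep(5)" hint at the end of start_llama_server(): a fixed sleep either wastes part of the 30-second GPU window or returns before the model has finished loading. A minimal readiness poll is sketched below; it assumes this llama-server build exposes the usual /health endpoint, and the helper name wait_for_server is illustrative, not part of the commit. requests is already imported at the top of app.py.

import time
import requests

def wait_for_server(port: int = 8081, timeout: float = 30.0) -> bool:
    """Poll llama-server until it answers instead of sleeping a fixed time."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # llama-server answers GET /health with 200 once the model is loaded
            if requests.get(f"http://127.0.0.1:{port}/health", timeout=1).status_code == 200:
                return True
        except requests.RequestException:
            pass  # not accepting connections yet; keep polling
        time.sleep(0.5)
    return False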
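On ZeroGPU Spaces, hardware is attached only while a @spaces.GPU-decorated function is executing, which is why the Popen call moved from module level into start_llama_server(). A hedged sketch of how a Gradio handler might drive it, using llama-server's /completion endpoint; respond and its wiring are assumptions for illustration, not code from this commit.

def respond(message: str) -> str:
    # Attaches the GPU for up to 30 s via @spaces.GPU(duration=30);
    # on later calls the early-return keeps the existing server.
    start_llama_server()
    if not wait_for_server():  # helper from the sketch above
        return "llama-server did not become ready in time."
    r = requests.post(
        "http://127.0.0.1:8081/completion",
        json={"prompt": message, "n_predict": 256},
        timeout=60,
    )
    r.raise_for_status()
    return r.json().get("content", "")

Whether the child process keeps GPU access after the decorated call returns depends on ZeroGPU's process handling, so it is worth verifying that later requests still run on the GPU.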