Spaces commit: zero adapt

app.py (CHANGED)
@@ -12,6 +12,7 @@ import requests
 import json
 import subprocess
 import gradio as gr
+import atexit
 
 today_date = datetime.today().strftime("%B %-d, %Y") # noqa: DTZ002
 
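A note on the `import atexit` added above: atexit handlers fire on normal interpreter exit, but not when the process is killed by an unhandled SIGTERM or a SIGKILL, which can happen when a Space is stopped or rebuilt. A small sketch (an assumption about the desired behavior, not part of this commit) that routes SIGTERM through the same cleanup path by converting it into a normal exit:

    import signal
    import sys

    def _handle_sigterm(signum, frame):
        # Raising SystemExit makes the interpreter shut down normally,
        # which in turn runs any registered atexit handlers.
        sys.exit(0)

    signal.signal(signal.SIGTERM, _handle_sigterm)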
@@ -32,6 +33,18 @@ TOP_P = 0.85
 TOP_K = 50
 REPETITION_PENALTY = 1.05
 
+# Global variable to store the server process
+llama_process = None
+
+# Ensure the server process is killed when the application exits
+def cleanup_server():
+    global llama_process
+    if llama_process and llama_process.poll() is None:
+        print("Stopping llama-server process...")
+        llama_process.terminate()
+        llama_process.wait(timeout=5)
+atexit.register(cleanup_server)
+
 # determine platform: CUDA or CPU
 try:
     subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
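One caveat with the cleanup hook as committed: `Popen.wait(timeout=5)` raises `subprocess.TimeoutExpired` if llama-server ignores SIGTERM, which would surface as a traceback during interpreter shutdown. A minimal hardened sketch (not part of this commit) that catches the timeout and escalates to `kill()`:

    def cleanup_server():
        global llama_process
        if llama_process and llama_process.poll() is None:
            print("Stopping llama-server process...")
            llama_process.terminate()      # polite shutdown request (SIGTERM)
            try:
                llama_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                llama_process.kill()       # escalate if it refuses to exit
                llama_process.wait()       # reap the process to avoid a zombie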
@@ -59,17 +72,37 @@ exe_path = hf_hub_download(
     filename=exe_name,
     local_dir="."
 )
-server_env = os.environ.copy()
-
-# start llama-server
 subprocess.run(["chmod", "+x", exe_name])
-
-
-
-
-
-
-
+
+# --- New Decorated Function to Launch Server on GPU ---
+@spaces.GPU(duration=30)
+def start_llama_server():
+    global llama_process
+
+    if llama_process and llama_process.poll() is None:
+        print("Server is already running.")
+        return
+
+    server_env = os.environ.copy()
+
+    # 1. Define the command (now explicitly using the CUDA binary)
+    command = [
+        "./" + exe_name,
+        "-m", gguf_name,
+        "--temp", "0.0",
+        "-c", "2048",
+        "-t", "8",
+        "--port", "8081",
+        "-ngl", "999" # <--- CRUCIAL: GPU offload instruction
+    ]
+
+    # 2. Launch the server now that the GPU is guaranteed to be available
+    llama_process = subprocess.Popen(command, env=server_env)
+    print(f"Llama-server process started with PID {llama_process.pid}")
+
+    # You might need a small sleep here to wait for the server to initialize
+    # time.sleep(5)
+
 
 custom_theme = ResearchMonochrome()
 print("Theme type:", type(custom_theme))
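Two notes on the new function. First, `@spaces.GPU(duration=30)` reserves the ZeroGPU device only for the duration of the decorated call, so whether a llama-server subprocess that outlives the call keeps GPU access is worth verifying on the target Space. Second, the commented-out `time.sleep(5)` is a fragile way to wait for startup, since model load time varies with hardware; `app.py` already imports `requests`, so a readiness poll is more dependable. A minimal sketch with a hypothetical `wait_for_server` helper, assuming the llama-server build in use exposes llama.cpp's usual `/health` endpoint (503 while the model is loading, 200 once ready) and that `time` is imported alongside the existing imports:

    import time  # assumed alongside the existing imports

    def wait_for_server(port: int = 8081, timeout: float = 60.0) -> bool:
        """Poll llama-server until it reports ready, or give up after `timeout` seconds."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                r = requests.get(f"http://127.0.0.1:{port}/health", timeout=2)
                if r.status_code == 200:  # 503 means the model is still loading
                    return True
            except requests.ConnectionError:
                pass  # socket not open yet; keep polling
            time.sleep(0.5)
        return False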
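Once `start_llama_server()` has run, the Gradio handlers can reach the server over plain HTTP on the port configured above. A hypothetical client call, assuming this llama-server build exposes the standard OpenAI-compatible chat endpoint:

    resp = requests.post(
        "http://127.0.0.1:8081/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": "Hello!"}],
            "temperature": 0.0,  # matches the server's --temp 0.0
            "max_tokens": 256,
        },
        timeout=120,
    )
    reply = resp.json()["choices"][0]["message"]["content"]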
|