TobDeBer committed
Commit fed41be · verified · 1 Parent(s): a972051

zero adapt

Files changed (1): app.py (+43 -10)
app.py CHANGED
@@ -12,6 +12,7 @@ import requests
 import json
 import subprocess
 import gradio as gr
+import atexit
 
 today_date = datetime.today().strftime("%B %-d, %Y") # noqa: DTZ002
 
@@ -32,6 +33,18 @@ TOP_P = 0.85
 TOP_K = 50
 REPETITION_PENALTY = 1.05
 
+# Global variable to store the server process
+llama_process = None
+
+# Ensure the server process is killed when the application exits
+def cleanup_server():
+    global llama_process
+    if llama_process and llama_process.poll() is None:
+        print("Stopping llama-server process...")
+        llama_process.terminate()
+        llama_process.wait(timeout=5)
+atexit.register(cleanup_server)
+
 # determine platform: CUDA or CPU
 try:
     subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
@@ -59,17 +72,37 @@ exe_path = hf_hub_download(
     filename=exe_name,
     local_dir="."
 )
-server_env = os.environ.copy()
-
-# start llama-server
 subprocess.run(["chmod", "+x", exe_name])
-command = ["./" + exe_name, "-m", gguf_name, "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"]
-if platform == "CUDA":
-    # -ngl 999 tells the server to attempt to offload 999 layers (essentially all of them)
-    # to the GPU for execution. Without this, it runs on the CPU, even with the CUDA binary.
-    command.extend(["-ngl", "999"])
-process = subprocess.Popen(command, env=server_env)
-print(f"Llama-server process started with PID {process.pid}")
+
+# --- New Decorated Function to Launch Server on GPU ---
+@spaces.GPU(duration=30)
+def start_llama_server():
+    global llama_process
+
+    if llama_process and llama_process.poll() is None:
+        print("Server is already running.")
+        return
+
+    server_env = os.environ.copy()
+
+    # 1. Define the command (now explicitly using the CUDA binary)
+    command = [
+        "./" + exe_name,
+        "-m", gguf_name,
+        "--temp", "0.0",
+        "-c", "2048",
+        "-t", "8",
+        "--port", "8081",
+        "-ngl", "999"  # <--- CRUCIAL: GPU offload instruction
+    ]
+
+    # 2. Launch the server now that the GPU is guaranteed to be available
+    llama_process = subprocess.Popen(command, env=server_env)
+    print(f"Llama-server process started with PID {llama_process.pid}")
+
+    # You might need a small sleep here to wait for the server to initialize
+    # time.sleep(5)
+
 
 custom_theme = ResearchMonochrome()
 print("Theme type:", type(custom_theme))
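Note on the "# time.sleep(5)" hint at the end of start_llama_server(): a fixed sleep either wastes part of the 30-second GPU window or returns before the model has finished loading. A minimal readiness poll is sketched below; it assumes this llama-server build exposes the usual /health endpoint, and the helper name wait_for_server is illustrative, not part of the commit. requests is already imported at the top of app.py.

import time
import requests

def wait_for_server(port: int = 8081, timeout: float = 30.0) -> bool:
    """Poll llama-server until it answers instead of sleeping a fixed time."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # llama-server answers GET /health with 200 once the model is loaded
            if requests.get(f"http://127.0.0.1:{port}/health", timeout=1).status_code == 200:
                return True
        except requests.RequestException:
            pass  # not accepting connections yet; keep polling
        time.sleep(0.5)
    return False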
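On ZeroGPU Spaces, hardware is attached only while a @spaces.GPU-decorated function is executing, which is why the Popen call moved from module level into start_llama_server(). A hedged sketch of how a Gradio handler might drive it, using llama-server's /completion endpoint; respond and its wiring are assumptions for illustration, not code from this commit.

def respond(message: str) -> str:
    # Attaches the GPU for up to 30 s via @spaces.GPU(duration=30);
    # on later calls the early-return keeps the existing server.
    start_llama_server()
    if not wait_for_server():  # helper from the sketch above
        return "llama-server did not become ready in time."
    r = requests.post(
        "http://127.0.0.1:8081/completion",
        json={"prompt": message, "n_predict": 256},
        timeout=60,
    )
    r.raise_for_status()
    return r.json().get("content", "")

Whether the child process keeps GPU access after the decorated call returns depends on ZeroGPU's process handling, so it is worth verifying that later requests still run on the GPU.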