Commit 079d1c0 · Parent: ccd721b
Force CPU-only environment and disable CUDA completely for HF Spaces CPU hardware
app.py CHANGED
@@ -18,6 +18,11 @@ import json
 import time
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
+# Force CPU-only environment to avoid CUDA initialization
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+os.environ["USE_CUDA"] = "0"
+os.environ["USE_GPU"] = "0"
+
 # Load environment variables from .env file if it exists
 try:
     from dotenv import load_dotenv
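
These variables only take effect if they are set before the first import of torch, because device visibility is fixed when the CUDA runtime initializes. CUDA_VISIBLE_DEVICES is the variable CUDA itself honors; USE_CUDA and USE_GPU are defensive extras (USE_CUDA is primarily a PyTorch build-time flag, not a runtime switch). A minimal standalone sketch of the ordering requirement, not part of the commit:

    import os

    # Hide all GPUs *before* torch is imported; once torch has initialized
    # CUDA, changing CUDA_VISIBLE_DEVICES no longer affects the process.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    import torch  # imported only after the environment is scrubbed

    print(torch.cuda.is_available())  # False: no devices are visible
    print(torch.cuda.device_count())  # 0
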
@@ -103,7 +108,7 @@ def get_mcp_client(model_id: str, provider: str, api_key: Optional[str]) -> MCPC
     return mcp_client
 
 async def get_gpt_oss_model_and_tokenizer():
-    """Get or create GPT-OSS-20B model and tokenizer with CPU-only loading."""
+    """Get or create GPT-OSS-20B model and tokenizer with strict CPU-only loading."""
     global gpt_oss_tokenizer, gpt_oss_model
 
     # Check if already loaded
@@ -121,6 +126,10 @@ async def get_gpt_oss_model_and_tokenizer():
     import torch
     from transformers import AutoTokenizer, AutoModelForCausalLM
 
+    # Force CPU-only torch configuration
+    torch.cuda.is_available = lambda: False
+    torch.cuda.device_count = lambda: 0
+
     print("Loading GPT-OSS-20B tokenizer...")
     gpt_oss_tokenizer = AutoTokenizer.from_pretrained(
         MODEL_ID,
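
Reassigning the query functions makes downstream code that asks about CUDA (transformers and accelerate consult torch.cuda.is_available() when resolving devices) behave as if no GPU exists. The guard only fools callers of those functions, though: an explicit .to("cuda") elsewhere would still touch the driver. A standalone sketch of the same idea:

    import torch

    # Shadow the CUDA queries so library device checks report no GPU.
    torch.cuda.is_available = lambda: False
    torch.cuda.device_count = lambda: 0

    assert not torch.cuda.is_available()
    assert torch.cuda.device_count() == 0
    # Caveat: torch.tensor([1.0]).to("cuda") would still attempt real CUDA init.
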
@@ -128,15 +137,20 @@ async def get_gpt_oss_model_and_tokenizer():
     )
 
     print("Loading GPT-OSS-20B model (CPU-only)...")
-    #
+    # Strict CPU-only loading configuration
     gpt_oss_model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float32,  # Use float32 for CPU compatibility
-        device_map=
+        device_map=None,  # Don't use device mapping
         trust_remote_code=True,
         low_cpu_mem_usage=True,
+        # Force CPU placement
+        **{"torch_dtype": torch.float32, "device": "cpu"}
     )
 
+    # Explicitly move to CPU
+    gpt_oss_model = gpt_oss_model.to("cpu")
+
     # Set model to evaluation mode
     gpt_oss_model.eval()
 
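
One caution on the call as committed: torch_dtype is passed twice, once as an explicit keyword and again via the unpacked dict, so Python rejects the call with a "got multiple values for keyword argument 'torch_dtype'" TypeError (and "device" is not a documented from_pretrained parameter; CPU placement is what the .to("cpu") line handles). A deduplicated sketch of the intended CPU-only load; the MODEL_ID value here is assumed, since app.py defines it elsewhere:

    import torch
    from transformers import AutoModelForCausalLM

    MODEL_ID = "openai/gpt-oss-20b"  # assumed value; defined elsewhere in app.py

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,   # CPU-friendly dtype, passed once
        device_map=None,             # no accelerate device mapping
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    ).to("cpu")                      # explicit CPU placement, as in the commit
    model.eval()
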
@@ -176,6 +190,9 @@ async def generate_with_gpt_oss(messages: List[Dict[str, Any]]) -> str:
         return_tensors="pt",
     )
 
+    # Ensure inputs are on CPU
+    inputs = {k: v.to("cpu") if hasattr(v, "to") else v for k, v in inputs.items()}
+
     # Generate with timeout protection
     try:
         import torch
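
The dict comprehension normalizes the tokenizer output: entries with a .to method (tensors) are moved to CPU, while plain Python values pass through unchanged. A self-contained sketch on synthetic data; the keys here are illustrative, not taken from app.py:

    import torch

    inputs = {
        "input_ids": torch.tensor([[101, 2023, 102]]),
        "attention_mask": torch.tensor([[1, 1, 1]]),
        "num_beams": 1,  # non-tensor entry: hasattr(v, "to") is False
    }
    inputs = {k: v.to("cpu") if hasattr(v, "to") else v for k, v in inputs.items()}

    assert all(t.device.type == "cpu"
               for t in inputs.values() if isinstance(t, torch.Tensor))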