sudipta26889 committed
Commit 079d1c0 · 1 Parent(s): ccd721b

Force CPU-only environment and disable CUDA completely for HF Spaces CPU hardware

Files changed (1):
  1. app.py +20 -3
app.py CHANGED

@@ -18,6 +18,11 @@ import json
 import time
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
+# Force CPU-only environment to avoid CUDA initialization
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+os.environ["USE_CUDA"] = "0"
+os.environ["USE_GPU"] = "0"
+
 # Load environment variables from .env file if it exists
 try:
     from dotenv import load_dotenv
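Note: CUDA_VISIBLE_DEVICES="" hides every GPU from the process, but only if it is set before torch is first imported, which is why these lines sit near the top of app.py. USE_CUDA and USE_GPU appear to be build-time flags rather than runtime switches, so they are belt-and-braces here. A minimal sketch of the ordering that matters:

    import os

    # Must run before the first `import torch`; the CUDA runtime reads
    # CUDA_VISIBLE_DEVICES when it is initialized.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    import torch

    print(torch.cuda.is_available())  # False: no devices are visible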
 
@@ -103,7 +108,7 @@ def get_mcp_client(model_id: str, provider: str, api_key: Optional[str]) -> MCPC
     return mcp_client
 
 async def get_gpt_oss_model_and_tokenizer():
-    """Get or create GPT-OSS-20B model and tokenizer with CPU-only loading."""
+    """Get or create GPT-OSS-20B model and tokenizer with strict CPU-only loading."""
     global gpt_oss_tokenizer, gpt_oss_model
 
     # Check if already loaded
 
@@ -121,6 +126,10 @@ async def get_gpt_oss_model_and_tokenizer():
         import torch
         from transformers import AutoTokenizer, AutoModelForCausalLM
 
+        # Force CPU-only torch configuration
+        torch.cuda.is_available = lambda: False
+        torch.cuda.device_count = lambda: 0
+
         print("🔄 Loading GPT-OSS-20B tokenizer...")
         gpt_oss_tokenizer = AutoTokenizer.from_pretrained(
             MODEL_ID,
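Note: reassigning torch.cuda.is_available and torch.cuda.device_count is a blunt guard; transformers and accelerate consult these functions when choosing device placement, so after the patch every such check reports a CUDA-free machine. A self-contained sketch of the effect:

    import torch

    # Make all subsequent device-detection checks report CPU-only.
    torch.cuda.is_available = lambda: False
    torch.cuda.device_count = lambda: 0

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device)  # "cpu", regardless of the actual hardware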
 
@@ -128,15 +137,20 @@
         )
 
         print("🔄 Loading GPT-OSS-20B model (CPU-only)...")
-        # Force CPU-only loading to avoid CUDA initialization issues
+        # Strict CPU-only loading configuration
         gpt_oss_model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.float32,  # Use float32 for CPU compatibility
-            device_map="cpu",  # Force CPU loading
+            device_map=None,  # Don't use device mapping
             trust_remote_code=True,
             low_cpu_mem_usage=True,
+            # Force CPU placement
+            **{"torch_dtype": torch.float32, "device": "cpu"}
         )
 
+        # Explicitly move to CPU
+        gpt_oss_model = gpt_oss_model.to("cpu")
+
         # Set model to evaluation mode
         gpt_oss_model.eval()
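Caveat: the added **{"torch_dtype": torch.float32, "device": "cpu"} expansion passes torch_dtype a second time, and Python raises TypeError: got multiple values for keyword argument 'torch_dtype' when a keyword arrives both explicitly and via ** in the same call; device is also not a documented from_pretrained argument. If the call fails for that reason, a leaner load that keeps the commit's intent would be (a sketch, not the committed code):

    gpt_oss_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # float32 for CPU compatibility
        device_map=None,            # no accelerate device mapping
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    ).to("cpu")                     # explicit CPU placement
    gpt_oss_model.eval()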
 
 
@@ -176,6 +190,9 @@ async def generate_with_gpt_oss(messages: List[Dict[str, Any]]) -> str:
             return_tensors="pt",
         )
 
+        # Ensure inputs are on CPU
+        inputs = {k: v.to("cpu") if hasattr(v, "to") else v for k, v in inputs.items()}
+
         # Generate with timeout protection
         try:
             import torch
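Note: moving every tensor in inputs to the CPU keeps them on the same device as the model, avoiding the usual "Expected all tensors to be on the same device" error at generation time; the hasattr(v, "to") guard matters because tokenizer outputs can mix tensors with plain Python values. A quick sanity check along the same lines (hypothetical snippet, reusing the app's variable names):

    model_device = next(gpt_oss_model.parameters()).device  # expect cpu
    assert all(
        v.device == model_device
        for v in inputs.values()
        if hasattr(v, "device")
    ), "model and inputs must share a device before generate()"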