ndc8
commited on
Commit
·
3960f0f
1
Parent(s):
78b611a
rabbit-ed
Browse files- README.md +1 -0
- gemma_gguf_backend.py +82 -9
- space.yaml +1 -1
- training/train_gemma_unsloth.py +41 -14
- training_runs/devlocal/meta.json +2 -2
README.md
CHANGED
|
@@ -431,4 +431,5 @@ To run with a real model locally:
|
|
| 431 |
```
|
| 432 |
|
| 433 |
## License
|
|
|
|
| 434 |
Apache 2.0
|
|
|
|
| 431 |
```
|
| 432 |
|
| 433 |
## License
|
| 434 |
+
|
| 435 |
Apache 2.0
|
gemma_gguf_backend.py
CHANGED
|
@@ -14,8 +14,9 @@ import sys
|
|
| 14 |
import subprocess
|
| 15 |
import threading
|
| 16 |
from pathlib import Path
|
|
|
|
| 17 |
|
| 18 |
-
from fastapi import FastAPI, HTTPException
|
| 19 |
from fastapi.responses import JSONResponse
|
| 20 |
from fastapi.middleware.cors import CORSMiddleware
|
| 21 |
from pydantic import BaseModel, Field, field_validator
|
|
@@ -28,6 +29,8 @@ except ImportError:
|
|
| 28 |
llama_cpp_available = False
|
| 29 |
|
| 30 |
import uvicorn
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Configure logging
|
| 33 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -72,6 +75,7 @@ class HealthResponse(BaseModel):
|
|
| 72 |
version: str
|
| 73 |
backend: str
|
| 74 |
|
|
|
|
| 75 |
# Global variables for model management
|
| 76 |
current_model = os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF")
|
| 77 |
llm = None
|
|
@@ -277,19 +281,57 @@ async def create_chat_completion(
|
|
| 277 |
# Training Job Management (Unsloth)
|
| 278 |
# -----------------------------
|
| 279 |
|
| 280 |
-
#
|
| 281 |
TRAIN_JOBS: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
TRAIN_DIR = Path(os.environ.get("TRAIN_DIR", "./training_runs")).resolve()
|
| 283 |
TRAIN_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
| 284 |
|
| 285 |
def _start_training_subprocess(job_id: str, args: Dict[str, Any]) -> subprocess.Popen[Any]:
|
| 286 |
"""Spawn a subprocess to run the Unsloth fine-tuning script."""
|
| 287 |
logs_dir = TRAIN_DIR / job_id
|
| 288 |
logs_dir.mkdir(parents=True, exist_ok=True)
|
| 289 |
log_file = open(logs_dir / "train.log", "w", encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
# Build absolute script path to avoid module/package resolution issues
|
| 292 |
script_path = (Path(__file__).parent / "training" / "train_gemma_unsloth.py").resolve()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
python_exec = sys.executable
|
| 294 |
|
| 295 |
cmd = [
|
|
@@ -338,6 +380,15 @@ def _watch_process(job_id: str, proc: subprocess.Popen[Any]):
|
|
| 338 |
TRAIN_JOBS[job_id]["status"] = status
|
| 339 |
TRAIN_JOBS[job_id]["return_code"] = return_code
|
| 340 |
TRAIN_JOBS[job_id]["ended_at"] = int(time.time())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
logger.info(f"🏁 Training job {job_id} finished with status={status}, code={return_code}")
|
| 342 |
|
| 343 |
class StartTrainingRequest(BaseModel):
|
|
@@ -376,6 +427,13 @@ class TrainStatusResponse(BaseModel):
|
|
| 376 |
@app.post("/train/start", response_model=StartTrainingResponse)
|
| 377 |
def start_training(req: StartTrainingRequest):
|
| 378 |
"""Start a background Unsloth fine-tuning job. Returns a job_id to poll."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
job_id = uuid.uuid4().hex[:12]
|
| 380 |
now = int(time.time())
|
| 381 |
output_dir = str((TRAIN_DIR / job_id).resolve())
|
|
@@ -386,18 +444,21 @@ def start_training(req: StartTrainingRequest):
|
|
| 386 |
"args": req.model_dump(),
|
| 387 |
"output_dir": output_dir,
|
| 388 |
}
|
|
|
|
| 389 |
|
| 390 |
try:
|
| 391 |
proc = _start_training_subprocess(job_id, req.model_dump())
|
| 392 |
TRAIN_JOBS[job_id]["status"] = "running"
|
| 393 |
TRAIN_JOBS[job_id]["pid"] = proc.pid
|
|
|
|
| 394 |
watcher = threading.Thread(target=_watch_process, args=(job_id, proc), daemon=True)
|
| 395 |
watcher.start()
|
| 396 |
return StartTrainingResponse(job_id=job_id, status="running", output_dir=output_dir)
|
| 397 |
except Exception as e:
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
|
|
|
| 401 |
|
| 402 |
@app.get("/train/status/{job_id}", response_model=TrainStatusResponse)
|
| 403 |
def train_status(job_id: str):
|
|
@@ -415,7 +476,10 @@ def train_status(job_id: str):
|
|
| 415 |
)
|
| 416 |
|
| 417 |
@app.get("/train/logs/{job_id}")
|
| 418 |
-
def train_logs(
|
|
|
|
|
|
|
|
|
|
| 419 |
job = TRAIN_JOBS.get(job_id)
|
| 420 |
if not job:
|
| 421 |
raise HTTPException(status_code=404, detail="Job not found")
|
|
@@ -438,11 +502,20 @@ def train_stop(job_id: str):
|
|
| 438 |
if not pid:
|
| 439 |
raise HTTPException(status_code=400, detail="Job does not have an active PID")
|
| 440 |
try:
|
| 441 |
-
os.kill(pid,
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
except Exception as e:
|
| 445 |
raise HTTPException(status_code=500, detail=f"Failed to stop job: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
|
| 447 |
# Main entry point
|
| 448 |
if __name__ == "__main__":
|
|
|
|
| 14 |
import subprocess
|
| 15 |
import threading
|
| 16 |
from pathlib import Path
|
| 17 |
+
import signal # Use signal.SIGTERM for process termination
|
| 18 |
|
| 19 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 20 |
from fastapi.responses import JSONResponse
|
| 21 |
from fastapi.middleware.cors import CORSMiddleware
|
| 22 |
from pydantic import BaseModel, Field, field_validator
|
|
|
|
| 29 |
llama_cpp_available = False
|
| 30 |
|
| 31 |
import uvicorn
|
| 32 |
+
import sqlite3
|
| 33 |
+
import json # For persisting job metadata
|
| 34 |
|
| 35 |
# Configure logging
|
| 36 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 75 |
version: str
|
| 76 |
backend: str
|
| 77 |
|
| 78 |
+
from pathlib import Path
|
| 79 |
# Global variables for model management
|
| 80 |
current_model = os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF")
|
| 81 |
llm = None
|
|
|
|
| 281 |
# Training Job Management (Unsloth)
|
| 282 |
# -----------------------------
|
| 283 |
|
| 284 |
+
# Persistent job store: in-memory dict backed by SQLite
|
| 285 |
TRAIN_JOBS: Dict[str, Dict[str, Any]] = {}
|
| 286 |
+
# Initialize SQLite DB for job persistence
|
| 287 |
+
DB_PATH = Path(os.environ.get("JOB_DB_PATH", "./jobs.db"))
|
| 288 |
+
conn = sqlite3.connect(str(DB_PATH), check_same_thread=False)
|
| 289 |
+
cursor = conn.cursor()
|
| 290 |
+
cursor.execute(
|
| 291 |
+
"""
|
| 292 |
+
CREATE TABLE IF NOT EXISTS jobs (
|
| 293 |
+
job_id TEXT PRIMARY KEY,
|
| 294 |
+
data TEXT NOT NULL
|
| 295 |
+
)
|
| 296 |
+
"""
|
| 297 |
+
)
|
| 298 |
+
conn.commit()
|
| 299 |
+
|
| 300 |
+
def load_jobs() -> None:
|
| 301 |
+
cursor.execute("SELECT job_id, data FROM jobs")
|
| 302 |
+
for job_id, data in cursor.fetchall():
|
| 303 |
+
TRAIN_JOBS[job_id] = json.loads(data)
|
| 304 |
+
|
| 305 |
+
def save_job(job_id: str) -> None:
|
| 306 |
+
cursor.execute(
|
| 307 |
+
"INSERT OR REPLACE INTO jobs (job_id, data) VALUES (?, ?)",
|
| 308 |
+
(job_id, json.dumps(TRAIN_JOBS[job_id]))
|
| 309 |
+
)
|
| 310 |
+
conn.commit()
|
| 311 |
+
|
| 312 |
+
# Load existing jobs on startup
|
| 313 |
+
load_jobs()
|
| 314 |
+
|
| 315 |
TRAIN_DIR = Path(os.environ.get("TRAIN_DIR", "./training_runs")).resolve()
|
| 316 |
TRAIN_DIR.mkdir(parents=True, exist_ok=True)
|
| 317 |
+
# Maximum concurrent training jobs
|
| 318 |
+
MAX_CONCURRENT_JOBS = int(os.environ.get("MAX_CONCURRENT_JOBS", "5"))
|
| 319 |
|
| 320 |
def _start_training_subprocess(job_id: str, args: Dict[str, Any]) -> subprocess.Popen[Any]:
|
| 321 |
"""Spawn a subprocess to run the Unsloth fine-tuning script."""
|
| 322 |
logs_dir = TRAIN_DIR / job_id
|
| 323 |
logs_dir.mkdir(parents=True, exist_ok=True)
|
| 324 |
log_file = open(logs_dir / "train.log", "w", encoding="utf-8")
|
| 325 |
+
# Store log file handle to close later
|
| 326 |
+
TRAIN_JOBS.setdefault(job_id, {})["log_file"] = log_file
|
| 327 |
+
save_job(job_id)
|
| 328 |
|
| 329 |
# Build absolute script path to avoid module/package resolution issues
|
| 330 |
script_path = (Path(__file__).parent / "training" / "train_gemma_unsloth.py").resolve()
|
| 331 |
+
# Verify training script exists
|
| 332 |
+
if not script_path.exists():
|
| 333 |
+
logger.error(f"Training script not found at {script_path}")
|
| 334 |
+
raise HTTPException(status_code=500, detail=f"Training script not found at {script_path}")
|
| 335 |
python_exec = sys.executable
|
| 336 |
|
| 337 |
cmd = [
|
|
|
|
| 380 |
TRAIN_JOBS[job_id]["status"] = status
|
| 381 |
TRAIN_JOBS[job_id]["return_code"] = return_code
|
| 382 |
TRAIN_JOBS[job_id]["ended_at"] = int(time.time())
|
| 383 |
+
# Persist updated job status
|
| 384 |
+
save_job(job_id)
|
| 385 |
+
# Close the log file handle to prevent resource leaks
|
| 386 |
+
log_file = TRAIN_JOBS[job_id].get("log_file")
|
| 387 |
+
if log_file:
|
| 388 |
+
try:
|
| 389 |
+
log_file.close()
|
| 390 |
+
except Exception as close_err:
|
| 391 |
+
logger.warning(f"Failed to close log file for job {job_id}: {close_err}")
|
| 392 |
logger.info(f"🏁 Training job {job_id} finished with status={status}, code={return_code}")
|
| 393 |
|
| 394 |
class StartTrainingRequest(BaseModel):
|
|
|
|
| 427 |
@app.post("/train/start", response_model=StartTrainingResponse)
|
| 428 |
def start_training(req: StartTrainingRequest):
|
| 429 |
"""Start a background Unsloth fine-tuning job. Returns a job_id to poll."""
|
| 430 |
+
# Enforce maximum concurrent training jobs
|
| 431 |
+
running_jobs = sum(1 for job in TRAIN_JOBS.values() if job.get("status") == "running")
|
| 432 |
+
if running_jobs >= MAX_CONCURRENT_JOBS:
|
| 433 |
+
raise HTTPException(
|
| 434 |
+
status_code=429,
|
| 435 |
+
detail=f"Maximum concurrent training jobs reached ({MAX_CONCURRENT_JOBS}). Try again later."
|
| 436 |
+
)
|
| 437 |
job_id = uuid.uuid4().hex[:12]
|
| 438 |
now = int(time.time())
|
| 439 |
output_dir = str((TRAIN_DIR / job_id).resolve())
|
|
|
|
| 444 |
"args": req.model_dump(),
|
| 445 |
"output_dir": output_dir,
|
| 446 |
}
|
| 447 |
+
save_job(job_id)
|
| 448 |
|
| 449 |
try:
|
| 450 |
proc = _start_training_subprocess(job_id, req.model_dump())
|
| 451 |
TRAIN_JOBS[job_id]["status"] = "running"
|
| 452 |
TRAIN_JOBS[job_id]["pid"] = proc.pid
|
| 453 |
+
save_job(job_id)
|
| 454 |
watcher = threading.Thread(target=_watch_process, args=(job_id, proc), daemon=True)
|
| 455 |
watcher.start()
|
| 456 |
return StartTrainingResponse(job_id=job_id, status="running", output_dir=output_dir)
|
| 457 |
except Exception as e:
|
| 458 |
+
logger.exception("Failed to start training job")
|
| 459 |
+
TRAIN_JOBS[job_id]["status"] = "failed_to_start"
|
| 460 |
+
save_job(job_id)
|
| 461 |
+
raise HTTPException(status_code=500, detail=f"Failed to start training: {e}")
|
| 462 |
|
| 463 |
@app.get("/train/status/{job_id}", response_model=TrainStatusResponse)
|
| 464 |
def train_status(job_id: str):
|
|
|
|
| 476 |
)
|
| 477 |
|
| 478 |
@app.get("/train/logs/{job_id}")
|
| 479 |
+
def train_logs(
|
| 480 |
+
job_id: str,
|
| 481 |
+
tail: int = Query(200, ge=0, le=1000, description="Number of lines to tail, between 0 and 1000"),
|
| 482 |
+
):
|
| 483 |
job = TRAIN_JOBS.get(job_id)
|
| 484 |
if not job:
|
| 485 |
raise HTTPException(status_code=404, detail="Job not found")
|
|
|
|
| 502 |
if not pid:
|
| 503 |
raise HTTPException(status_code=400, detail="Job does not have an active PID")
|
| 504 |
try:
|
| 505 |
+
os.kill(pid, signal.SIGTERM)
|
| 506 |
+
except ProcessLookupError:
|
| 507 |
+
logger.warning(
|
| 508 |
+
f"Process {pid} for job {job_id} not found; may have exited already"
|
| 509 |
+
)
|
| 510 |
+
job["status"] = "stopping_failed"
|
| 511 |
+
save_job(job_id)
|
| 512 |
+
return {"job_id": job_id, "status": job["status"]}
|
| 513 |
except Exception as e:
|
| 514 |
raise HTTPException(status_code=500, detail=f"Failed to stop job: {e}")
|
| 515 |
+
else:
|
| 516 |
+
job["status"] = "stopping"
|
| 517 |
+
save_job(job_id)
|
| 518 |
+
return {"job_id": job_id, "status": "stopping"}
|
| 519 |
|
| 520 |
# Main entry point
|
| 521 |
if __name__ == "__main__":
|
space.yaml
CHANGED
|
@@ -2,4 +2,4 @@ sdk: fastapi
|
|
| 2 |
python_version: 3.10
|
| 3 |
app_file: gemma_gguf_backend.py
|
| 4 |
env:
|
| 5 |
-
- DEMO_MODE=
|
|
|
|
| 2 |
python_version: 3.10
|
| 3 |
app_file: gemma_gguf_backend.py
|
| 4 |
env:
|
| 5 |
+
- DEMO_MODE=0 # Ensure model loads properly in production
|
training/train_gemma_unsloth.py
CHANGED
|
@@ -12,6 +12,9 @@ import json
|
|
| 12 |
import time
|
| 13 |
from pathlib import Path
|
| 14 |
from typing import Any, Dict
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# Lazy imports to keep API light
|
| 17 |
|
|
@@ -40,7 +43,12 @@ def _import_training_libs() -> Dict[str, Any]:
|
|
| 40 |
"FastLanguageModel": FastLanguageModel,
|
| 41 |
"AutoTokenizer": AutoTokenizer,
|
| 42 |
}
|
| 43 |
-
except
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Fallback: pure HF + PEFT (CPU / MPS friendly)
|
| 45 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 46 |
from peft import get_peft_model, LoraConfig
|
|
@@ -161,10 +169,18 @@ def main():
|
|
| 161 |
tokenizer = AutoTokenizer.from_pretrained(args.model_id, use_fast=True, trust_remote_code=True)
|
| 162 |
# Prefer MPS on Apple Silicon if available
|
| 163 |
use_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
model = AutoModelForCausalLM.from_pretrained(
|
| 166 |
args.model_id,
|
| 167 |
-
torch_dtype=
|
| 168 |
trust_remote_code=True,
|
| 169 |
)
|
| 170 |
if use_mps:
|
|
@@ -190,17 +206,25 @@ def main():
|
|
| 190 |
response_field = args.response_field
|
| 191 |
|
| 192 |
if text_field:
|
| 193 |
-
# Simple SFT: single text field
|
| 194 |
-
def format_row(ex):
|
|
|
|
|
|
|
| 195 |
return ex[text_field]
|
| 196 |
elif prompt_field and response_field:
|
| 197 |
-
# Chat data: prompt + response
|
| 198 |
-
def format_row(ex):
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
else:
|
| 201 |
raise ValueError("Provide either --text-field or both --prompt-field and --response-field")
|
| 202 |
|
| 203 |
-
def map_fn(ex):
|
| 204 |
return {"text": format_row(ex)}
|
| 205 |
|
| 206 |
ds = ds.map(map_fn, remove_columns=[c for c in ds.column_names if c != "text"])
|
|
@@ -237,13 +261,16 @@ def main():
|
|
| 237 |
adapter_path.mkdir(parents=True, exist_ok=True)
|
| 238 |
# Save adapter-only weights if PEFT; Unsloth path is also PEFT-compatible
|
| 239 |
try:
|
|
|
|
| 240 |
model.save_pretrained(str(adapter_path))
|
| 241 |
-
except Exception:
|
| 242 |
-
|
| 243 |
try:
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
| 247 |
tokenizer.save_pretrained(str(adapter_path))
|
| 248 |
|
| 249 |
# Write done file
|
|
|
|
| 12 |
import time
|
| 13 |
from pathlib import Path
|
| 14 |
from typing import Any, Dict
|
| 15 |
+
import logging
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
# Lazy imports to keep API light
|
| 20 |
|
|
|
|
| 43 |
"FastLanguageModel": FastLanguageModel,
|
| 44 |
"AutoTokenizer": AutoTokenizer,
|
| 45 |
}
|
| 46 |
+
except ImportError as e:
|
| 47 |
+
logger.warning(
|
| 48 |
+
"Primary Unsloth import failed, falling back to HF+PEFT: %s",
|
| 49 |
+
e,
|
| 50 |
+
exc_info=True,
|
| 51 |
+
)
|
| 52 |
# Fallback: pure HF + PEFT (CPU / MPS friendly)
|
| 53 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 54 |
from peft import get_peft_model, LoraConfig
|
|
|
|
| 169 |
tokenizer = AutoTokenizer.from_pretrained(args.model_id, use_fast=True, trust_remote_code=True)
|
| 170 |
# Prefer MPS on Apple Silicon if available
|
| 171 |
use_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
|
| 172 |
+
if not use_mps:
|
| 173 |
+
if args.use_fp16:
|
| 174 |
+
dtype = torch.float16
|
| 175 |
+
elif args.use_bf16:
|
| 176 |
+
dtype = torch.bfloat16
|
| 177 |
+
else:
|
| 178 |
+
dtype = torch.float32
|
| 179 |
+
else:
|
| 180 |
+
dtype = torch.float32
|
| 181 |
model = AutoModelForCausalLM.from_pretrained(
|
| 182 |
args.model_id,
|
| 183 |
+
torch_dtype=dtype,
|
| 184 |
trust_remote_code=True,
|
| 185 |
)
|
| 186 |
if use_mps:
|
|
|
|
| 206 |
response_field = args.response_field
|
| 207 |
|
| 208 |
if text_field:
|
| 209 |
+
# Simple SFT: single text field with validation
|
| 210 |
+
def format_row(ex: Dict[str, Any]) -> str:
|
| 211 |
+
if text_field not in ex:
|
| 212 |
+
raise KeyError(f"Missing required text field '{text_field}' in example: {ex}")
|
| 213 |
return ex[text_field]
|
| 214 |
elif prompt_field and response_field:
|
| 215 |
+
# Chat data: prompt + response with validation
|
| 216 |
+
def format_row(ex: Dict[str, Any]) -> str:
|
| 217 |
+
missing = [f for f in (prompt_field, response_field) if f not in ex]
|
| 218 |
+
if missing:
|
| 219 |
+
raise KeyError(f"Missing required field(s) {missing} in example: {ex}")
|
| 220 |
+
return (
|
| 221 |
+
f"<start_of_turn>user\n{ex[prompt_field]}<end_of_turn>\n"
|
| 222 |
+
f"<start_of_turn>model\n{ex[response_field]}<end_of_turn>\n"
|
| 223 |
+
)
|
| 224 |
else:
|
| 225 |
raise ValueError("Provide either --text-field or both --prompt-field and --response-field")
|
| 226 |
|
| 227 |
+
def map_fn(ex: Dict[str, Any]) -> Dict[str, str]:
|
| 228 |
return {"text": format_row(ex)}
|
| 229 |
|
| 230 |
ds = ds.map(map_fn, remove_columns=[c for c in ds.column_names if c != "text"])
|
|
|
|
| 261 |
adapter_path.mkdir(parents=True, exist_ok=True)
|
| 262 |
# Save adapter-only weights if PEFT; Unsloth path is also PEFT-compatible
|
| 263 |
try:
|
| 264 |
+
# Primary model saving logic
|
| 265 |
model.save_pretrained(str(adapter_path))
|
| 266 |
+
except Exception as e:
|
| 267 |
+
logger.error("Error during primary model saving: %s", e, exc_info=True) # type: ignore
|
| 268 |
try:
|
| 269 |
+
# Fallback model saving logic
|
| 270 |
+
model.base_model.save_pretrained(str(adapter_path)) # type: ignore[attr-defined]
|
| 271 |
+
except Exception as fallback_e:
|
| 272 |
+
logger.error("Fallback model saving failed: %s", fallback_e, exc_info=True) # type: ignore
|
| 273 |
+
pass # Optionally re-raise or handle accordingly
|
| 274 |
tokenizer.save_pretrained(str(adapter_path))
|
| 275 |
|
| 276 |
# Write done file
|
training_runs/devlocal/meta.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"job_id": "devlocal",
|
| 3 |
"model_id": "unsloth/gemma-3n-E4B-it",
|
| 4 |
-
"dataset": "
|
| 5 |
"created_at": 1754620844
|
| 6 |
-
}
|
|
|
|
| 1 |
{
|
| 2 |
"job_id": "devlocal",
|
| 3 |
"model_id": "unsloth/gemma-3n-E4B-it",
|
| 4 |
+
"dataset": "sample_data/train.jsonl",
|
| 5 |
"created_at": 1754620844
|
| 6 |
+
}
|