Spaces:
Running
Running
adds imports and forces monitoring
Browse files
- scripts/training/train_gpt_oss.py +55 -6
- src/monitoring.py +3 -2
scripts/training/train_gpt_oss.py
CHANGED
|
@@ -32,6 +32,10 @@ if str(project_root) not in sys.path:
|
|
| 32 |
config_dir = project_root / "config"
|
| 33 |
if str(config_dir) not in sys.path:
|
| 34 |
sys.path.insert(0, str(config_dir))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# Reduce tokenizer thread contention and improve CUDA allocator behavior
|
| 37 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
|
@@ -945,25 +949,55 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
|
|
| 945 |
# Split into train/eval/test
|
| 946 |
train_dataset, eval_dataset, test_dataset = split_dataset(dataset, config)
|
| 947 |
|
| 948 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
trackio_client = setup_trackio_tracking(config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 950 |
|
| 951 |
# Initialize project monitor (HF Datasets + Trackio Space if configured)
|
| 952 |
-
monitor = None
|
| 953 |
monitor_callback = None
|
| 954 |
if create_monitor_from_config is not None:
|
| 955 |
try:
|
| 956 |
-
|
| 957 |
# Persist configuration immediately
|
| 958 |
try:
|
| 959 |
cfg_dict = {k: v for k, v in config.__dict__.items() if not k.startswith('_')}
|
| 960 |
-
|
| 961 |
except Exception:
|
| 962 |
pass
|
| 963 |
# Create callback for SFTTrainer
|
| 964 |
-
monitor_callback =
|
|
|
|
|
|
|
|
|
|
| 965 |
except Exception:
|
| 966 |
-
|
| 967 |
|
| 968 |
# Create SFT configuration
|
| 969 |
sft_config = create_sft_config(config, output_dir)
|
|
@@ -1042,6 +1076,14 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
|
|
| 1042 |
if "callbacks" in sft_params:
|
| 1043 |
sft_kwargs["callbacks"] = ([monitor_callback] if monitor_callback is not None else [])
|
| 1044 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1045 |
# Remove any None values
|
| 1046 |
sft_kwargs = {k: v for k, v in sft_kwargs.items() if v is not None}
|
| 1047 |
|
|
@@ -1083,6 +1125,13 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
|
|
| 1083 |
except Exception:
|
| 1084 |
pass
|
| 1085 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1086 |
print("GPT-OSS training completed successfully!")
|
| 1087 |
|
| 1088 |
return trainer
|
|
|
|
| 32 |
config_dir = project_root / "config"
|
| 33 |
if str(config_dir) not in sys.path:
|
| 34 |
sys.path.insert(0, str(config_dir))
|
| 35 |
+
# Ensure 'src' is importable for modules like 'monitoring', 'model', etc.
|
| 36 |
+
src_dir = project_root / "src"
|
| 37 |
+
if str(src_dir) not in sys.path:
|
| 38 |
+
sys.path.insert(0, str(src_dir))
|
| 39 |
|
| 40 |
# Reduce tokenizer thread contention and improve CUDA allocator behavior
|
| 41 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
|
|
|
| 949 |
# Split into train/eval/test
|
| 950 |
train_dataset, eval_dataset, test_dataset = split_dataset(dataset, config)
|
| 951 |
|
| 952 |
+
# Ensure TRACKIO_URL env is set so SmolLM3Monitor picks it up
|
| 953 |
+
if trackio_url and not os.environ.get('TRACKIO_URL'):
|
| 954 |
+
os.environ['TRACKIO_URL'] = trackio_url
|
| 955 |
+
os.environ.setdefault('TRACKIO_SPACE_ID', trackio_url)
|
| 956 |
+
|
| 957 |
+
# Setup Trackio tracking (Space API client) and monitoring (dataset + Space)
|
| 958 |
trackio_client = setup_trackio_tracking(config)
|
| 959 |
+
# Create unified monitor to ensure metrics get logged to dataset/Space
|
| 960 |
+
monitor = None
|
| 961 |
+
try:
|
| 962 |
+
from monitoring import SmolLM3Monitor
|
| 963 |
+
monitor = SmolLM3Monitor(
|
| 964 |
+
experiment_name=experiment_name,
|
| 965 |
+
trackio_url=trackio_url,
|
| 966 |
+
trackio_token=getattr(config, 'trackio_token', None) or os.environ.get('HF_TOKEN'),
|
| 967 |
+
enable_tracking=True,
|
| 968 |
+
log_artifacts=True,
|
| 969 |
+
log_metrics=True,
|
| 970 |
+
log_config=True,
|
| 971 |
+
hf_token=os.environ.get('HF_TOKEN'),
|
| 972 |
+
dataset_repo=os.environ.get('TRACKIO_DATASET_REPO')
|
| 973 |
+
)
|
| 974 |
+
# Log configuration once
|
| 975 |
+
try:
|
| 976 |
+
cfg_dict = {k: getattr(config, k) for k in dir(config) if not k.startswith('_') and not callable(getattr(config, k))}
|
| 977 |
+
monitor.log_configuration(cfg_dict)
|
| 978 |
+
except Exception:
|
| 979 |
+
pass
|
| 980 |
+
except Exception as e:
|
| 981 |
+
print(f"Warning: failed to initialize monitor: {e}")
|
| 982 |
|
| 983 |
# Initialize project monitor (HF Datasets + Trackio Space if configured)
|
|
|
|
| 984 |
monitor_callback = None
|
| 985 |
if create_monitor_from_config is not None:
|
| 986 |
try:
|
| 987 |
+
project_monitor = create_monitor_from_config(config, experiment_name=experiment_name)
|
| 988 |
# Persist configuration immediately
|
| 989 |
try:
|
| 990 |
cfg_dict = {k: v for k, v in config.__dict__.items() if not k.startswith('_')}
|
| 991 |
+
project_monitor.log_config(cfg_dict)
|
| 992 |
except Exception:
|
| 993 |
pass
|
| 994 |
# Create callback for SFTTrainer
|
| 995 |
+
monitor_callback = project_monitor.create_monitoring_callback()
|
| 996 |
+
# If we didn't initialize the explicit monitor above, use this one for summary/close
|
| 997 |
+
if monitor is None:
|
| 998 |
+
monitor = project_monitor
|
| 999 |
except Exception:
|
| 1000 |
+
pass
|
| 1001 |
|
| 1002 |
# Create SFT configuration
|
| 1003 |
sft_config = create_sft_config(config, output_dir)
|
|
|
|
| 1076 |
if "callbacks" in sft_params:
|
| 1077 |
sft_kwargs["callbacks"] = ([monitor_callback] if monitor_callback is not None else [])
|
| 1078 |
|
| 1079 |
+
# Attach monitoring callback if supported
|
| 1080 |
+
if monitor is not None:
|
| 1081 |
+
try:
|
| 1082 |
+
if "callbacks" in sft_params:
|
| 1083 |
+
sft_kwargs["callbacks"] = [monitor.create_monitoring_callback()]
|
| 1084 |
+
except Exception:
|
| 1085 |
+
pass
|
| 1086 |
+
|
| 1087 |
# Remove any None values
|
| 1088 |
sft_kwargs = {k: v for k, v in sft_kwargs.items() if v is not None}
|
| 1089 |
|
|
|
|
| 1125 |
except Exception:
|
| 1126 |
pass
|
| 1127 |
|
| 1128 |
+
# Close monitor cleanly
|
| 1129 |
+
try:
|
| 1130 |
+
if monitor is not None:
|
| 1131 |
+
monitor.close()
|
| 1132 |
+
except Exception:
|
| 1133 |
+
pass
|
| 1134 |
+
|
| 1135 |
print("GPT-OSS training completed successfully!")
|
| 1136 |
|
| 1137 |
return trainer
|
src/monitoring.py
CHANGED
|
@@ -347,7 +347,7 @@ class SmolLM3Monitor:
|
|
| 347 |
return False
|
| 348 |
|
| 349 |
def log_configuration(self, config: Dict[str, Any]):
|
| 350 |
-
"""Log experiment configuration"""
|
| 351 |
if not self.log_config_enabled:
|
| 352 |
return
|
| 353 |
|
|
@@ -424,7 +424,8 @@ class SmolLM3Monitor:
|
|
| 424 |
self.metrics_history.append(metrics)
|
| 425 |
|
| 426 |
# Save to HF Dataset periodically (configurable)
|
| 427 |
-
|
|
|
|
| 428 |
self._save_to_hf_dataset({'metrics': self.metrics_history})
|
| 429 |
|
| 430 |
logger.debug("Metrics logged: %s", metrics)
|
|
|
|
| 347 |
return False
|
| 348 |
|
| 349 |
def log_configuration(self, config: Dict[str, Any]):
|
| 350 |
+
"""Log experiment configuration (always attempts dataset persistence)"""
|
| 351 |
if not self.log_config_enabled:
|
| 352 |
return
|
| 353 |
|
|
|
|
| 424 |
self.metrics_history.append(metrics)
|
| 425 |
|
| 426 |
# Save to HF Dataset periodically (configurable)
|
| 427 |
+
flush_every = getattr(self, 'flush_interval', 10)
|
| 428 |
+
if flush_every and (len(self.metrics_history) % flush_every == 0):
|
| 429 |
self._save_to_hf_dataset({'metrics': self.metrics_history})
|
| 430 |
|
| 431 |
logger.debug("Metrics logged: %s", metrics)
|