Tonic committed
Commit 190d843 · 1 Parent(s): d455d12

adds imports and forces monitoring
scripts/training/train_gpt_oss.py CHANGED
@@ -32,6 +32,10 @@ if str(project_root) not in sys.path:
 config_dir = project_root / "config"
 if str(config_dir) not in sys.path:
     sys.path.insert(0, str(config_dir))
+# Ensure 'src' is importable for modules like 'monitoring', 'model', etc.
+src_dir = project_root / "src"
+if str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 # Reduce tokenizer thread contention and improve CUDA allocator behavior
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
@@ -945,25 +949,55 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
     # Split into train/eval/test
     train_dataset, eval_dataset, test_dataset = split_dataset(dataset, config)
 
-    # Setup Trackio tracking
+    # Ensure TRACKIO_URL env is set so SmolLM3Monitor picks it up
+    if trackio_url and not os.environ.get('TRACKIO_URL'):
+        os.environ['TRACKIO_URL'] = trackio_url
+        os.environ.setdefault('TRACKIO_SPACE_ID', trackio_url)
+
+    # Setup Trackio tracking (Space API client) and monitoring (dataset + Space)
     trackio_client = setup_trackio_tracking(config)
+    # Create unified monitor to ensure metrics get logged to dataset/Space
+    monitor = None
+    try:
+        from monitoring import SmolLM3Monitor
+        monitor = SmolLM3Monitor(
+            experiment_name=experiment_name,
+            trackio_url=trackio_url,
+            trackio_token=getattr(config, 'trackio_token', None) or os.environ.get('HF_TOKEN'),
+            enable_tracking=True,
+            log_artifacts=True,
+            log_metrics=True,
+            log_config=True,
+            hf_token=os.environ.get('HF_TOKEN'),
+            dataset_repo=os.environ.get('TRACKIO_DATASET_REPO')
+        )
+        # Log configuration once
+        try:
+            cfg_dict = {k: getattr(config, k) for k in dir(config) if not k.startswith('_') and not callable(getattr(config, k))}
+            monitor.log_configuration(cfg_dict)
+        except Exception:
+            pass
+    except Exception as e:
+        print(f"Warning: failed to initialize monitor: {e}")
 
     # Initialize project monitor (HF Datasets + Trackio Space if configured)
-    monitor = None
     monitor_callback = None
     if create_monitor_from_config is not None:
         try:
-            monitor = create_monitor_from_config(config, experiment_name=experiment_name)
+            project_monitor = create_monitor_from_config(config, experiment_name=experiment_name)
             # Persist configuration immediately
             try:
                 cfg_dict = {k: v for k, v in config.__dict__.items() if not k.startswith('_')}
-                monitor.log_config(cfg_dict)
+                project_monitor.log_config(cfg_dict)
             except Exception:
                 pass
             # Create callback for SFTTrainer
-            monitor_callback = monitor.create_monitoring_callback()
+            monitor_callback = project_monitor.create_monitoring_callback()
+            # If we didn't initialize the explicit monitor above, use this one for summary/close
+            if monitor is None:
+                monitor = project_monitor
         except Exception:
-            monitor = None
+            pass
 
     # Create SFT configuration
     sft_config = create_sft_config(config, output_dir)
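One subtlety in the hunk above: the new unified monitor snapshots the config via dir(config), while the project-monitor path keeps the config.__dict__ comprehension. The dir() form also picks up class-level defaults, so the two snapshots can differ. A self-contained sketch of the difference:

    class Cfg:
        model_name = "gpt-oss"           # class-level default
        def __init__(self):
            self.learning_rate = 2e-4    # instance attribute

    cfg = Cfg()

    # __dict__ sees instance attributes only:
    via_dict = {k: v for k, v in cfg.__dict__.items() if not k.startswith('_')}
    # dir() also sees class-level attributes:
    via_dir = {k: getattr(cfg, k) for k in dir(cfg)
               if not k.startswith('_') and not callable(getattr(cfg, k))}

    print(via_dict)  # {'learning_rate': 0.0002}
    print(via_dir)   # {'learning_rate': 0.0002, 'model_name': 'gpt-oss'}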
@@ -1042,6 +1076,14 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
     if "callbacks" in sft_params:
         sft_kwargs["callbacks"] = ([monitor_callback] if monitor_callback is not None else [])
 
+    # Attach monitoring callback if supported
+    if monitor is not None:
+        try:
+            if "callbacks" in sft_params:
+                sft_kwargs["callbacks"] = [monitor.create_monitoring_callback()]
+        except Exception:
+            pass
+
     # Remove any None values
     sft_kwargs = {k: v for k, v in sft_kwargs.items() if v is not None}
 
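Note that when monitor is set, the assignment above replaces the [monitor_callback] list built a few lines earlier rather than appending to it. create_monitoring_callback() is expected to return a transformers-style callback; its internals are not part of this diff, so the following is only an assumed sketch of that interface:

    from transformers import TrainerCallback

    class MonitoringCallback(TrainerCallback):
        """Hypothetical shape of what create_monitoring_callback() returns."""
        def __init__(self, monitor):
            self.monitor = monitor

        def on_log(self, args, state, control, logs=None, **kwargs):
            # logs carries the trainer's latest metrics (loss, learning_rate, ...)
            if logs:
                self.monitor.log_metrics(logs, step=state.global_step)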
@@ -1083,6 +1125,13 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
     except Exception:
         pass
 
+    # Close monitor cleanly
+    try:
+        if monitor is not None:
+            monitor.close()
+    except Exception:
+        pass
+
     print("GPT-OSS training completed successfully!")
 
     return trainer
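The close is wrapped in try/except so a monitoring failure cannot mask a successful run. close() itself is not shown in this diff; given the _save_to_hf_dataset flush in src/monitoring.py below, a plausible sketch (assumed, not the actual implementation):

    def close(self):
        # Flush any metrics still buffered in memory before shutting down.
        if self.metrics_history:
            self._save_to_hf_dataset({'metrics': self.metrics_history})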
src/monitoring.py CHANGED
@@ -347,7 +347,7 @@ class SmolLM3Monitor:
         return False
 
     def log_configuration(self, config: Dict[str, Any]):
-        """Log experiment configuration"""
+        """Log experiment configuration (always attempts dataset persistence)"""
         if not self.log_config_enabled:
             return
 
@@ -424,7 +424,8 @@ class SmolLM3Monitor:
         self.metrics_history.append(metrics)
 
         # Save to HF Dataset periodically (configurable)
-        if self.flush_interval > 0 and (len(self.metrics_history) % self.flush_interval == 0):
+        flush_every = getattr(self, 'flush_interval', 10)
+        if flush_every and (len(self.metrics_history) % flush_every == 0):
             self._save_to_hf_dataset({'metrics': self.metrics_history})
 
         logger.debug("Metrics logged: %s", metrics)
 