Joseph Pollack committed
Commit 68a76d2 · unverified · 1 Parent(s): 68e42bf

adds network timeout wait

scripts/__pycache__/deploy_demo_space.cpython-313.pyc ADDED
Binary file (53.2 kB).
 
scripts/__pycache__/push_to_huggingface.cpython-313.pyc ADDED
Binary file (45.6 kB).
 
scripts/__pycache__/train_lora.cpython-313.pyc ADDED
Binary file (22.6 kB).
 
scripts/deploy_demo_space.py CHANGED
@@ -566,7 +566,7 @@ os.environ['BRAND_PROJECT_URL'] = {_json.dumps(self.brand_project_url)}
             f"app_file: app.py\n"
             f"pinned: false\n"
             f"short_description: Interactive demo for {self.model_id}\n"
-            + ("license: mit\n" if self.demo_type != 'gpt' else "") +
+            + ("license: mit\\n" if self.demo_type != 'gpt' else "") +
             f"---\n\n"
         )
 
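Why the extra backslash: the hunk context (os.environ['BRAND_PROJECT_URL'] = {_json.dumps(...)}) suggests this README-building code is itself embedded in a generated-script template, where the escape must survive one extra round of string interpretation. A minimal sketch of the difference, under that assumption:

# "\n" is a single newline character; "\\n" is the two characters
# backslash + 'n', which a *generated* script later re-reads as the
# newline escape when its source is parsed.
s_single = "license: mit\n"   # 13 chars: ends with a real line break
s_double = "license: mit\\n"  # 14 chars: ends with backslash + 'n'
assert len(s_single) == 13 and len(s_double) == 14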
 
scripts/push_to_huggingface.py CHANGED
@@ -137,54 +137,126 @@ class HuggingFacePusher:
 
     def _detect_artifact_type(self) -> str:
         """Detect whether output dir contains a full model or a LoRA adapter."""
-        # LoRA artifacts
-        lora_candidates = [
-            self.model_path / "adapter_config.json",
-            self.model_path / "adapter_model.safetensors",
-            self.model_path / "adapter_model.bin",
-        ]
-        if any(p.exists() for p in lora_candidates) and (self.model_path / "adapter_config.json").exists():
+        logger.info(f"Detecting model artifacts in: {self.model_path}")
+
+        # Check if path exists
+        if not self.model_path.exists():
+            logger.error(f"❌ Model path does not exist: {self.model_path}")
+            return "unknown"
+
+        # List all files for debugging
+        all_files = list(self.model_path.rglob("*"))
+        logger.info(f"📁 Found {len(all_files)} files in model directory")
+        if len(all_files) <= 20:  # Only show if not too many files
+            for f in all_files:
+                logger.info(f"   - {f.relative_to(self.model_path)}")
+
+        # LoRA artifacts - be more flexible about file combinations
+        lora_config = self.model_path / "adapter_config.json"
+        lora_weights_safetensors = self.model_path / "adapter_model.safetensors"
+        lora_weights_bin = self.model_path / "adapter_model.bin"
+
+        has_lora_config = lora_config.exists()
+        has_lora_weights = lora_weights_safetensors.exists() or lora_weights_bin.exists()
+
+        if has_lora_config:
+            logger.info("✅ Found adapter_config.json")
+        if has_lora_weights:
+            logger.info("✅ Found LoRA weight files")
+
+        if has_lora_config and has_lora_weights:
+            logger.info("🎯 Detected LoRA adapter artifacts")
             return "lora"
-
-        # Full model artifacts
-        full_candidates = [
-            self.model_path / "config.json",
-            self.model_path / "model.safetensors",
-            self.model_path / "model.safetensors.index.json",
-            self.model_path / "pytorch_model.bin",
-        ]
-        if any(p.exists() for p in full_candidates):
+        elif has_lora_config:
+            logger.warning("⚠️ Found adapter_config.json but no weight files")
+        elif has_lora_weights:
+            logger.warning("⚠️ Found LoRA weight files but no adapter_config.json")
+
+        # Full model artifacts - also be more flexible
+        config_file = self.model_path / "config.json"
+        safetensors_model = self.model_path / "model.safetensors"
+        safetensors_index = self.model_path / "model.safetensors.index.json"
+        pytorch_model = self.model_path / "pytorch_model.bin"
+
+        has_config = config_file.exists()
+        has_weights = (safetensors_model.exists() or safetensors_index.exists() or pytorch_model.exists())
+
+        if has_config:
+            logger.info("✅ Found config.json")
+        if has_weights:
+            logger.info("✅ Found model weight files")
+
+        if has_config and has_weights:
+            logger.info("🎯 Detected full model artifacts")
             return "full"
+        elif has_config:
+            logger.warning("⚠️ Found config.json but no weight files")
+        elif has_weights:
+            logger.warning("⚠️ Found weight files but no config.json")
 
+        logger.error("❌ Could not detect model artifacts (neither full model nor LoRA)")
         return "unknown"
 
     def validate_model_path(self) -> bool:
         """Validate that the model path contains required files for Voxtral full or LoRA."""
         self.artifact_type = self._detect_artifact_type()
+
+        if self.artifact_type == "unknown":
+            logger.error("❌ Could not detect model type. Expected files:")
+            logger.error("   For LoRA: adapter_config.json + adapter_model.safetensors (or .bin)")
+            logger.error("   For Full Model: config.json + model.safetensors (or pytorch_model.bin)")
+            logger.error("   For Voxtral ASR: also look for processor_config.json, tokenizer.json, etc.")
+            return False
+
         if self.artifact_type == "lora":
-            required = [self.model_path / "adapter_config.json"]
-            if not all(p.exists() for p in required):
-                logger.error("❌ LoRA artifacts missing required files (adapter_config.json)")
+            # Check for required LoRA files
+            config_file = self.model_path / "adapter_config.json"
+            weights_file_safetensors = self.model_path / "adapter_model.safetensors"
+            weights_file_bin = self.model_path / "adapter_model.bin"
+
+            if not config_file.exists():
+                logger.error("❌ LoRA adapter missing required file: adapter_config.json")
                 return False
-            # At least one adapter weight
-            if not ((self.model_path / "adapter_model.safetensors").exists() or (self.model_path / "adapter_model.bin").exists()):
-                logger.error("❌ LoRA artifacts missing adapter weights (adapter_model.safetensors or adapter_model.bin)")
+
+            if not (weights_file_safetensors.exists() or weights_file_bin.exists()):
+                logger.error("❌ LoRA adapter missing weight files: adapter_model.safetensors or adapter_model.bin")
                 return False
-            logger.info("✅ Detected LoRA adapter artifacts")
+
+            logger.info("✅ LoRA adapter validation successful")
+            logger.info(f"   - Config: {config_file.name}")
+            if weights_file_safetensors.exists():
+                logger.info(f"   - Weights: {weights_file_safetensors.name}")
+            elif weights_file_bin.exists():
+                logger.info(f"   - Weights: {weights_file_bin.name}")
+
             return True
 
         if self.artifact_type == "full":
-            # Relaxed set: require config.json and at least one model weights file
-            if not (self.model_path / "config.json").exists():
-                logger.error("❌ Missing config.json in model directory")
+            # Check for required full model files
+            config_file = self.model_path / "config.json"
+            safetensors_file = self.model_path / "model.safetensors"
+            safetensors_index = self.model_path / "model.safetensors.index.json"
+            pytorch_file = self.model_path / "pytorch_model.bin"
+
+            if not config_file.exists():
+                logger.error("❌ Full model missing required file: config.json")
                 return False
-            if not ((self.model_path / "model.safetensors").exists() or (self.model_path / "model.safetensors.index.json").exists() or (self.model_path / "pytorch_model.bin").exists()):
-                logger.error("❌ Missing model weights file (model.safetensors or pytorch_model.bin)")
+
+            if not (safetensors_file.exists() or safetensors_index.exists() or pytorch_file.exists()):
+                logger.error("❌ Full model missing weight files: model.safetensors, model.safetensors.index.json, or pytorch_model.bin")
                 return False
-            logger.info("✅ Detected full model artifacts")
+
+            logger.info("✅ Full model validation successful")
+            logger.info(f"   - Config: {config_file.name}")
+            if safetensors_file.exists():
+                logger.info(f"   - Weights: {safetensors_file.name}")
+            elif safetensors_index.exists():
+                logger.info(f"   - Weights: {safetensors_index.name} (sharded)")
+            elif pytorch_file.exists():
+                logger.info(f"   - Weights: {pytorch_file.name}")
+
             return True
 
-        logger.error("❌ Could not detect model artifacts (neither full model nor LoRA)")
         return False
 
     def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
@@ -455,9 +527,16 @@ MIT License
                    results: Optional[Dict[str, Any]] = None) -> bool:
         """Complete model push process"""
         logger.info(f"🚀 Starting model push to {self.repo_id}")
+        logger.info(f"📂 Model path: {self.model_path}")
+        logger.info(f"🎯 Repository: {self.repo_id}")
 
         # Validate model path
        if not self.validate_model_path():
+            logger.error("❌ Model validation failed. Please check:")
+            logger.error("   1. The model path exists and contains the expected files")
+            logger.error("   2. For LoRA models: adapter_config.json and adapter_model.* files")
+            logger.error("   3. For full models: config.json and model weight files")
+            logger.error("   4. Make sure the training completed successfully and saved the model")
            return False
 
        # Create repository
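Both detection branches are now conjunctive: adapter_config.json alone, or a weights file alone, no longer classifies the directory (the old code accepted adapter_config.json by itself for LoRA, and any single full-model file for "full"). A self-contained sketch of the resulting decision table for quick local testing (detect is an illustrative standalone rewrite, not a repo import):

import tempfile
from pathlib import Path

def detect(p: Path) -> str:
    """Mirror of the new _detect_artifact_type decision rules."""
    if not p.exists():
        return "unknown"
    has_lora = (p / "adapter_config.json").exists() and (
        (p / "adapter_model.safetensors").exists()
        or (p / "adapter_model.bin").exists()
    )
    if has_lora:
        return "lora"
    has_full = (p / "config.json").exists() and (
        (p / "model.safetensors").exists()
        or (p / "model.safetensors.index.json").exists()
        or (p / "pytorch_model.bin").exists()
    )
    return "full" if has_full else "unknown"

d = Path(tempfile.mkdtemp())
(d / "adapter_config.json").write_text("{}")
assert detect(d) == "unknown"   # config alone no longer passes
(d / "adapter_model.safetensors").touch()
assert detect(d) == "lora"      # config + weights => LoRA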
scripts/train_lora.py CHANGED
@@ -24,6 +24,7 @@ Get your token from: https://huggingface.co/settings/tokens
 
 import argparse
 import json
+import time
 from pathlib import Path
 from datetime import datetime
 from typing import Tuple, Optional
@@ -285,50 +286,117 @@ def main():
     if not trackio_space:
         trackio_space = get_default_space_name("voxtral-lora-finetuning")
 
-    # Initialize trackio for experiment tracking
+    # Initialize trackio for experiment tracking with retry logic
+    trackio_enabled = False
     if trackio_space:
         print(f"Initializing trackio with space: {trackio_space}")
-        trackio.init(
-            project="voxtral-lora-finetuning",
-            config={
-                "model_checkpoint": model_checkpoint,
-                "output_dir": output_dir,
-                "batch_size": args.batch_size,
-                "learning_rate": args.learning_rate,
-                "epochs": args.epochs,
-                "train_count": args.train_count,
-                "eval_count": args.eval_count,
-                "dataset_jsonl": args.dataset_jsonl,
-                "dataset_name": args.dataset_name,
-                "dataset_config": args.dataset_config,
-                "lora_r": args.lora_r,
-                "lora_alpha": args.lora_alpha,
-                "lora_dropout": args.lora_dropout,
-                "freeze_audio_tower": args.freeze_audio_tower,
-            },
-            space_id=trackio_space
-        )
+        try:
+            trackio.init(
+                project="voxtral-lora-finetuning",
+                config={
+                    "model_checkpoint": model_checkpoint,
+                    "output_dir": output_dir,
+                    "batch_size": args.batch_size,
+                    "learning_rate": args.learning_rate,
+                    "epochs": args.epochs,
+                    "train_count": args.train_count,
+                    "eval_count": args.eval_count,
+                    "dataset_jsonl": args.dataset_jsonl,
+                    "dataset_name": args.dataset_name,
+                    "dataset_config": args.dataset_config,
+                    "lora_r": args.lora_r,
+                    "lora_alpha": args.lora_alpha,
+                    "lora_dropout": args.lora_dropout,
+                    "freeze_audio_tower": args.freeze_audio_tower,
+                },
+                space_id=trackio_space
+            )
+            trackio_enabled = True
+            print("✅ Trackio initialized successfully")
+        except Exception as e:
+            print(f"❌ Failed to initialize trackio with space: {e}")
+            print("⏳ Waiting 3 minutes for space to deploy before retrying...")
+            time.sleep(180)  # Wait 3 minutes (180 seconds)
+
+            print("🔄 Retrying trackio initialization with space...")
+            try:
+                trackio.init(
+                    project="voxtral-lora-finetuning",
+                    config={
+                        "model_checkpoint": model_checkpoint,
+                        "output_dir": output_dir,
+                        "batch_size": args.batch_size,
+                        "learning_rate": args.learning_rate,
+                        "epochs": args.epochs,
+                        "train_count": args.train_count,
+                        "eval_count": args.eval_count,
+                        "dataset_jsonl": args.dataset_jsonl,
+                        "dataset_name": args.dataset_name,
+                        "dataset_config": args.dataset_config,
+                        "lora_r": args.lora_r,
+                        "lora_alpha": args.lora_alpha,
+                        "lora_dropout": args.lora_dropout,
+                        "freeze_audio_tower": args.freeze_audio_tower,
+                    },
+                    space_id=trackio_space
+                )
+                trackio_enabled = True
+                print("✅ Trackio initialized successfully after retry")
+            except Exception as retry_e:
+                print(f"❌ Retry also failed: {retry_e}")
+                print("🔄 Falling back to local-only mode...")
+                try:
+                    trackio.init(
+                        project="voxtral-lora-finetuning",
+                        config={
+                            "model_checkpoint": model_checkpoint,
+                            "output_dir": output_dir,
+                            "batch_size": args.batch_size,
+                            "learning_rate": args.learning_rate,
+                            "epochs": args.epochs,
+                            "train_count": args.train_count,
+                            "eval_count": args.eval_count,
+                            "dataset_jsonl": args.dataset_jsonl,
+                            "dataset_name": args.dataset_name,
+                            "dataset_config": args.dataset_config,
+                            "lora_r": args.lora_r,
+                            "lora_alpha": args.lora_alpha,
+                            "lora_dropout": args.lora_dropout,
+                            "freeze_audio_tower": args.freeze_audio_tower,
+                        }
+                    )
+                    trackio_enabled = True
+                    print("✅ Trackio initialized in local-only mode")
+                except Exception as fallback_e:
+                    print(f"❌ Failed to initialize trackio in local mode: {fallback_e}")
+                    print("⚠️ Training will continue without experiment tracking")
     else:
         print("Initializing trackio in local-only mode")
-        trackio.init(
-            project="voxtral-lora-finetuning",
-            config={
-                "model_checkpoint": model_checkpoint,
-                "output_dir": output_dir,
-                "batch_size": args.batch_size,
-                "learning_rate": args.learning_rate,
-                "epochs": args.epochs,
-                "train_count": args.train_count,
-                "eval_count": args.eval_count,
-                "dataset_jsonl": args.dataset_jsonl,
-                "dataset_name": args.dataset_name,
-                "dataset_config": args.dataset_config,
-                "lora_r": args.lora_r,
-                "lora_alpha": args.lora_alpha,
-                "lora_dropout": args.lora_dropout,
-                "freeze_audio_tower": args.freeze_audio_tower,
-            }
-        )
+        try:
+            trackio.init(
+                project="voxtral-lora-finetuning",
+                config={
+                    "model_checkpoint": model_checkpoint,
+                    "output_dir": output_dir,
+                    "batch_size": args.batch_size,
+                    "learning_rate": args.learning_rate,
+                    "epochs": args.epochs,
+                    "train_count": args.train_count,
+                    "eval_count": args.eval_count,
+                    "dataset_jsonl": args.dataset_jsonl,
+                    "dataset_name": args.dataset_name,
+                    "dataset_config": args.dataset_config,
+                    "lora_r": args.lora_r,
+                    "lora_alpha": args.lora_alpha,
+                    "lora_dropout": args.lora_dropout,
+                    "freeze_audio_tower": args.freeze_audio_tower,
+                }
+            )
+            trackio_enabled = True
+            print("✅ Trackio initialized in local-only mode")
+        except Exception as e:
+            print(f"❌ Failed to initialize trackio: {e}")
+            print("⚠️ Training will continue without experiment tracking")
 
     print("Loading processor and model...")
     processor = VoxtralProcessor.from_pretrained(model_checkpoint)
@@ -397,8 +465,9 @@ def main():
     if eval_dataset:
         results = trainer.evaluate()
         print(f"Final evaluation results: {results}")
-        # Log final evaluation results
-        trackio.log(results)
+        # Log final evaluation results if trackio is enabled
+        if trackio_enabled:
+            trackio.log(results)
 
     # Push dataset to Hub if requested
     if args.push_dataset and args.dataset_jsonl:
@@ -433,8 +502,9 @@ def main():
     except Exception as e:
         print(f"❌ Error pushing dataset: {e}")
 
-    # Finish trackio logging
-    trackio.finish()
+    # Finish trackio logging if enabled
+    if trackio_enabled:
+        trackio.finish()
 
     print("Training completed successfully!")
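The same trackio.init call (and its 14-key config) now appears three times: initial attempt, post-wait retry, and local fallback. A possible consolidation, sketched with an illustrative helper name (init_trackio_with_retry is not in the repo):

import time

def init_trackio_with_retry(config: dict, space_id=None, wait_seconds=180) -> bool:
    """Try space-backed init, wait once for the Space to deploy and retry,
    then fall back to local-only mode. Returns True if tracking is enabled."""
    import trackio
    # With a Space: attempt twice against the Space, then once locally.
    attempts = [{"space_id": space_id}] * 2 + [{}] if space_id else [{}]
    for i, extra in enumerate(attempts):
        try:
            trackio.init(project="voxtral-lora-finetuning", config=config, **extra)
            return True
        except Exception as e:
            print(f"❌ trackio.init failed (attempt {i + 1}): {e}")
            if i == 0 and space_id:
                print("⏳ Waiting for the Space to deploy before retrying...")
                time.sleep(wait_seconds)
    print("⚠️ Training will continue without experiment tracking")
    return False

The three call sites would then reduce to a single trackio_enabled = init_trackio_with_retry(config, trackio_space).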