Joseph Pollack committed: improves dataset push to huggingface

Files changed:
- README.md (+1, -1)
- interface.py (+103, -10)
- scripts/push_to_huggingface.py (+209, -32)
README.md CHANGED

@@ -1,6 +1,6 @@
 ---
 title: VoxFactory
-emoji:
+emoji: 🎬️
 colorFrom: gray
 colorTo: red
 sdk: gradio
interface.py CHANGED

@@ -177,11 +177,12 @@ def _save_uploaded_dataset(files: list, transcripts: list[str]) -> str:
 
 
 def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") -> str:
-    """Push dataset to Hugging Face Hub"""
+    """Push dataset to Hugging Face Hub including audio files"""
     try:
         from huggingface_hub import HfApi, create_repo
         import json
         from pathlib import Path
+        import os
 
         token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
 
@@ -210,16 +211,74 @@ def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") -> str:
         if not jsonl_file.exists():
             return f"❌ Dataset file not found: {jsonl_path}"
 
-        #
+        # Read and process the JSONL to collect audio files and update paths
+        audio_files = []
+        updated_rows = []
+        total_audio_size = 0
+
+        with open(jsonl_file, "r", encoding="utf-8") as f:
+            for line_num, line in enumerate(f):
+                try:
+                    row = json.loads(line.strip())
+                    audio_path = row.get("audio_path", "")
+
+                    if audio_path:
+                        audio_file = Path(audio_path)
+                        if audio_file.exists():
+                            # Store the original file for upload
+                            audio_files.append(audio_file)
+                            total_audio_size += audio_file.stat().st_size
+
+                            # Update path to be relative for the dataset
+                            row["audio_path"] = f"audio/{audio_file.name}"
+                        else:
+                            print(f"⚠️ Warning: Audio file not found: {audio_path}")
+                            row["audio_path"] = ""  # Clear missing files
+
+                    updated_rows.append(row)
+                except json.JSONDecodeError as e:
+                    print(f"⚠️ Warning: Invalid JSON on line {line_num + 1}: {e}")
+                    continue
+
+        # Create updated JSONL with relative paths
+        temp_jsonl_path = jsonl_file.parent / "temp_data.jsonl"
+        with open(temp_jsonl_path, "w", encoding="utf-8") as f:
+            for row in updated_rows:
+                f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+        # Upload the updated JSONL file
         api.upload_file(
-            path_or_fileobj=str(
+            path_or_fileobj=str(temp_jsonl_path),
             path_in_repo="data.jsonl",
             repo_id=repo_name,
             repo_type="dataset",
             token=token
         )
 
-        #
+        # Clean up temp file
+        temp_jsonl_path.unlink()
+
+        # Upload audio files
+        uploaded_count = 0
+        for audio_file in audio_files:
+            try:
+                remote_path = f"audio/{audio_file.name}"
+                api.upload_file(
+                    path_or_fileobj=str(audio_file),
+                    path_in_repo=remote_path,
+                    repo_id=repo_name,
+                    repo_type="dataset",
+                    token=token
+                )
+                uploaded_count += 1
+                print(f"✅ Uploaded audio file: {audio_file.name}")
+            except Exception as e:
+                print(f"❌ Failed to upload {audio_file.name}: {e}")
+
+        # Calculate total dataset size
+        total_dataset_size = jsonl_file.stat().st_size + total_audio_size
+
+        # Create README for the dataset
         readme_content = f"""---
 dataset_info:
   features:
@@ -230,9 +289,15 @@ dataset_info:
   splits:
   - name: train
     num_bytes: {jsonl_file.stat().st_size}
-    num_examples: {
-  download_size: {
-  dataset_size: {
+    num_examples: {len(updated_rows)}
+  download_size: {total_dataset_size}
+  dataset_size: {total_dataset_size}
+tags:
+- voxtral
+- asr
+- speech-to-text
+- fine-tuning
+- audio-dataset
 ---
 
 # Voxtral ASR Dataset
@@ -241,15 +306,43 @@ This dataset was created using the Voxtral ASR Fine-tuning Interface.
 
 ## Dataset Structure
 
-- **audio_path**:
+- **audio_path**: Relative path to the audio file (stored in `audio/` directory)
 - **text**: Transcription of the audio
 
+## Dataset Statistics
+
+- **Number of examples**: {len(updated_rows)}
+- **Audio files uploaded**: {uploaded_count}
+- **Total dataset size**: {total_dataset_size:,} bytes
+
 ## Usage
 
 ```python
-from datasets import load_dataset
+from datasets import load_dataset, Audio
+
+# Load dataset
+dataset = load_dataset("{repo_name}")
+
+# Load audio data
+dataset = dataset.cast_column("audio_path", Audio())
+
+# Access first example
+print(dataset[0]["text"])
+print(dataset[0]["audio_path"])
+```
+
+## Loading with Audio Decoding
+
+```python
+from datasets import load_dataset, Audio
 
+# Load with automatic audio decoding
 dataset = load_dataset("{repo_name}")
+dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))
+
+# The audio column will contain the decoded audio arrays
+audio_array = dataset[0]["audio_path"]["array"]
+sampling_rate = dataset[0]["audio_path"]["sampling_rate"]
 ```
 """
 
@@ -268,7 +361,7 @@ dataset = load_dataset("{repo_name}")
 
         readme_path.unlink()  # Clean up temp file
 
-        return f"✅ Dataset pushed to: https://huggingface.co/datasets/{repo_name}"
+        return f"✅ Dataset pushed to: https://huggingface.co/datasets/{repo_name}\nUploaded {len(updated_rows)} examples and {uploaded_count} audio files"
 
     except Exception as e:
         return f"❌ Failed to push dataset: {e}"
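The README generated above casts the repo-relative `audio_path` strings (for example `audio/clip.wav`) to `Audio()`, which only decodes once those files exist locally. A minimal sketch of one way a consumer might load the pushed dataset, assuming the layout this commit creates (`data.jsonl` at the repo root plus an `audio/` folder); the repo id and the `snapshot_download` step are assumptions, not part of the commit:

```python
# Sketch only: REPO_ID is a placeholder, and using snapshot_download to make the
# relative audio paths resolvable locally is an assumption, not the commit's code.
import os
from datasets import load_dataset, Audio
from huggingface_hub import snapshot_download

REPO_ID = "username/my-voxtral-dataset"  # hypothetical dataset repo

# Fetch data.jsonl and the audio/ folder so the relative paths point at real files
local_dir = snapshot_download(repo_id=REPO_ID, repo_type="dataset")

ds = load_dataset("json", data_files=os.path.join(local_dir, "data.jsonl"), split="train")

# Drop rows whose audio was missing at push time (the commit stores them as "")
ds = ds.filter(lambda row: row["audio_path"])

# Make the paths absolute, then decode them as 16 kHz audio
ds = ds.map(lambda row: {"audio_path": os.path.join(local_dir, row["audio_path"])})
ds = ds.cast_column("audio_path", Audio(sampling_rate=16000))

print(ds[0]["text"])
print(ds[0]["audio_path"]["array"].shape)
```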
scripts/push_to_huggingface.py CHANGED

@@ -502,11 +502,11 @@ MIT License
         return True
 
     def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
-        """Push dataset to Hugging Face Hub"""
+        """Push dataset to Hugging Face Hub including audio files"""
         logger.info(f"Starting dataset push to {dataset_repo_name}")
 
         try:
-            from huggingface_hub import create_repo
+            from huggingface_hub import create_repo, upload_file
             import json
 
             # Determine full dataset repo name
@@ -529,15 +529,44 @@ MIT License
                 logger.error(f"❌ Dataset file not found: {dataset_path}")
                 return False
 
-            #
-
-
-
-            file_size = dataset_file.stat().st_size
+            # Read and process the JSONL to collect audio files and update paths
+            audio_files = []
+            updated_rows = []
+            total_audio_size = 0
 
-
+            with open(dataset_file, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f):
+                    try:
+                        row = json.loads(line.strip())
+                        audio_path = row.get("audio_path", "")
+
+                        if audio_path:
+                            audio_file = Path(audio_path)
+                            if audio_file.exists():
+                                # Store the original file for upload
+                                audio_files.append(audio_file)
+                                total_audio_size += audio_file.stat().st_size
+
+                                # Update path to be relative for the dataset
+                                row["audio_path"] = f"audio/{audio_file.name}"
+                            else:
+                                logger.warning(f"Audio file not found: {audio_path}")
+                                row["audio_path"] = ""  # Clear missing files
+
+                        updated_rows.append(row)
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"Invalid JSON on line {line_num + 1}: {e}")
+                        continue
+
+            # Create updated JSONL with relative paths
+            temp_jsonl_path = dataset_file.parent / "temp_data.jsonl"
+            with open(temp_jsonl_path, "w", encoding="utf-8") as f:
+                for row in updated_rows:
+                    f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+            # Upload the updated JSONL file
             upload_file(
-                path_or_fileobj=str(
+                path_or_fileobj=str(temp_jsonl_path),
                 path_in_repo="data.jsonl",
                 repo_id=dataset_repo_name,
                 repo_type="dataset",
@@ -545,7 +574,30 @@ MIT License
             )
             logger.info(f"✅ Uploaded dataset file: {dataset_file.name}")
 
-            #
+            # Clean up temp file
+            temp_jsonl_path.unlink()
+
+            # Upload audio files
+            uploaded_count = 0
+            for audio_file in audio_files:
+                try:
+                    remote_path = f"audio/{audio_file.name}"
+                    upload_file(
+                        path_or_fileobj=str(audio_file),
+                        path_in_repo=remote_path,
+                        repo_id=dataset_repo_name,
+                        repo_type="dataset",
+                        token=self.token
+                    )
+                    uploaded_count += 1
+                    logger.info(f"✅ Uploaded audio file: {audio_file.name}")
+                except Exception as e:
+                    logger.error(f"❌ Failed to upload {audio_file.name}: {e}")
+
+            # Calculate total dataset size
+            total_dataset_size = dataset_file.stat().st_size + total_audio_size
+
+            # Create a comprehensive dataset README
             readme_content = f"""---
 dataset_info:
   features:
@@ -555,18 +607,17 @@ dataset_info:
     dtype: string
   splits:
   - name: train
-    num_bytes: {
-    num_examples: {
-  download_size: {
-  dataset_size: {
+    num_bytes: {dataset_file.stat().st_size}
+    num_examples: {len(updated_rows)}
+  download_size: {total_dataset_size}
+  dataset_size: {total_dataset_size}
 tags:
 - voxtral
 - asr
-- fine-tuning
-- conversational
 - speech-to-text
--
--
+- fine-tuning
+- audio-dataset
+- tonic
 ---
 
 # Voxtral ASR Dataset
@@ -575,21 +626,53 @@ This dataset was created for fine-tuning Voxtral ASR models.
 
 ## Dataset Structure
 
-- **audio_path**:
+- **audio_path**: Relative path to the audio file (stored in `audio/` directory)
 - **text**: Transcription of the audio
 
-## Statistics
+## Dataset Statistics
 
-- Number of examples
--
+- **Number of examples**: {len(updated_rows)}
+- **Audio files uploaded**: {uploaded_count}
+- **Total dataset size**: {total_dataset_size:,} bytes
 
 ## Usage
 
 ```python
-from datasets import load_dataset
+from datasets import load_dataset, Audio
 
+# Load dataset
 dataset = load_dataset("{dataset_repo_name}")
+
+# Load audio data
+dataset = dataset.cast_column("audio_path", Audio())
+
+# Access first example
+print(dataset[0]["text"])
+print(dataset[0]["audio_path"])
 ```
+
+## Loading with Audio Decoding
+
+```python
+from datasets import load_dataset, Audio
+
+# Load with automatic audio decoding
+dataset = load_dataset("{dataset_repo_name}")
+dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))
+
+# The audio column will contain the decoded audio arrays
+audio_array = dataset[0]["audio_path"]["array"]
+sampling_rate = dataset[0]["audio_path"]["sampling_rate"]
+```
+
+## Dataset Features
+
+This dataset contains audio files with corresponding transcriptions for Voxtral ASR model fine-tuning.
+All audio files are stored in the `audio/` directory and referenced using relative paths in the dataset.
+
+## License
+
+This dataset is created for research and educational purposes.
 """
 
             # Upload README
@@ -609,13 +692,97 @@ dataset = load_dataset("{dataset_repo_name}")
 
             logger.info(f"✅ Dataset README uploaded")
             logger.info(f"Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
+            logger.info(f"Uploaded {len(updated_rows)} examples and {uploaded_count} audio files")
 
             return True
 
         except Exception as e:
             logger.error(f"❌ Failed to push dataset: {e}")
             return False
-
+
+    def test_dataset_push(self, dataset_path: str) -> bool:
+        """Test dataset validation without uploading to Hugging Face Hub"""
+        logger.info(f"🧪 Testing dataset validation for {dataset_path}")
+
+        try:
+            # Read the dataset file
+            dataset_file = Path(dataset_path)
+            if not dataset_file.exists():
+                logger.error(f"❌ Dataset file not found: {dataset_path}")
+                return False
+
+            # Read and process the JSONL to validate audio files
+            audio_files = []
+            updated_rows = []
+            total_audio_size = 0
+            missing_files = []
+            invalid_json_lines = []
+
+            with open(dataset_file, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f):
+                    try:
+                        row = json.loads(line.strip())
+                        audio_path = row.get("audio_path", "")
+
+                        if audio_path:
+                            audio_file = Path(audio_path)
+                            if audio_file.exists():
+                                # Store the file info for validation
+                                audio_files.append(audio_file)
+                                total_audio_size += audio_file.stat().st_size
+                            else:
+                                missing_files.append(str(audio_path))
+
+                        updated_rows.append(row)
+                    except json.JSONDecodeError as e:
+                        invalid_json_lines.append(f"Line {line_num + 1}: {e}")
+                        continue
+
+            # Report validation results
+            logger.info("Dataset Validation Results:")
+            logger.info(f"  - Total examples: {len(updated_rows)}")
+            logger.info(f"  - Valid audio files: {len(audio_files)}")
+            logger.info(f"  - Total audio size: {total_audio_size:,} bytes")
+            logger.info(f"  - Missing audio files: {len(missing_files)}")
+            logger.info(f"  - Invalid JSON lines: {len(invalid_json_lines)}")
+
+            if missing_files:
+                logger.warning("⚠️ Missing audio files:")
+                for missing in missing_files[:5]:  # Show first 5
+                    logger.warning(f"  - {missing}")
+                if len(missing_files) > 5:
+                    logger.warning(f"  ... and {len(missing_files) - 5} more")
+
+            if invalid_json_lines:
+                logger.warning("⚠️ Invalid JSON lines:")
+                for invalid in invalid_json_lines[:3]:  # Show first 3
+                    logger.warning(f"  - {invalid}")
+                if len(invalid_json_lines) > 3:
+                    logger.warning(f"  ... and {len(invalid_json_lines) - 3} more")
+
+            # Show sample of how paths will be converted
+            if audio_files:
+                logger.info("Path conversion preview:")
+                for audio_file in audio_files[:3]:  # Show first 3
+                    logger.info(f"  - {str(audio_file)} → audio/{audio_file.name}")
+
+            # Overall validation status
+            if len(updated_rows) == 0:
+                logger.error("❌ No valid examples found in dataset")
+                return False
+
+            if len(missing_files) > 0:
+                logger.warning("⚠️ Some audio files are missing - they will be skipped during upload")
+            else:
+                logger.info("✅ All audio files found and valid")
+
+            logger.info("✅ Dataset validation completed successfully!")
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to validate dataset: {e}")
+            return False
+
     def _load_training_config(self) -> Dict[str, Any]:
         """Load training configuration"""
         config_path = self.model_path / "training_config.json"
@@ -656,6 +823,7 @@ def parse_args():
     dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
     dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
     dataset_parser.add_argument('--private', action='store_true', help='Make repository private')
+    dataset_parser.add_argument('--test', action='store_true', help='Test mode - validate dataset without uploading')
 
     return parser.parse_args()
 
@@ -710,15 +878,24 @@ def main():
                 private=args.private
             )
 
-
-
-
-
-
-
+            if getattr(args, 'test', False):
+                # Test mode - validate dataset without uploading
+                success = pusher.test_dataset_push(args.dataset_path)
+                if success:
+                    logger.info("✅ Dataset validation completed successfully!")
+                else:
+                    logger.error("❌ Dataset validation failed!")
+                    return 1
             else:
-
-
+                # Push dataset
+                success = pusher.push_dataset(args.dataset_path, args.repo_name)
+
+                if success:
+                    logger.info("✅ Dataset push completed successfully!")
+                    logger.info(f"View your dataset at: https://huggingface.co/datasets/{args.repo_name}")
+                else:
+                    logger.error("❌ Dataset push failed!")
+                    return 1
 
     except Exception as e:
         logger.error(f"❌ Error during push: {e}")
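Both push paths upload the audio clips one `upload_file` call at a time, which creates one Hub commit per file. A minimal sketch of an alternative that batches the whole local audio folder into a single commit; the folder and repo names are placeholders, and this is not what the commit implements:

```python
# Sketch only: AUDIO_DIR and REPO_ID are placeholders; this batches the upload as an
# alternative to the per-file upload_file loop shown in the diff above.
from pathlib import Path
from huggingface_hub import HfApi

AUDIO_DIR = Path("datasets/audio")       # local folder holding the recorded clips
REPO_ID = "username/my-voxtral-dataset"  # hypothetical dataset repo

api = HfApi()  # token resolved from the environment/cache if not passed explicitly

# Upload every file under AUDIO_DIR in one commit, mirrored to the audio/ prefix
# that the rewritten data.jsonl references.
api.upload_folder(
    folder_path=str(AUDIO_DIR),
    path_in_repo="audio",
    repo_id=REPO_ID,
    repo_type="dataset",
    commit_message="Add audio files",
)
```

Batching the folder keeps the repo history to a handful of commits and reduces API round-trips when a dataset has many clips.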