Joseph Pollack
committed on
improves dataset push to huggingface
- README.md +1 -1
- interface.py +103 -10
- scripts/push_to_huggingface.py +209 -32
README.md
CHANGED

@@ -1,6 +1,6 @@
 ---
 title: VoxFactory
-emoji:
+emoji: 🌬️
 colorFrom: gray
 colorTo: red
 sdk: gradio
interface.py
CHANGED

@@ -177,11 +177,12 @@ def _save_uploaded_dataset(files: list, transcripts: list[str]) -> str:
 
 
 def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") -> str:
-    """Push dataset to Hugging Face Hub"""
+    """Push dataset to Hugging Face Hub including audio files"""
     try:
         from huggingface_hub import HfApi, create_repo
         import json
         from pathlib import Path
+        import os
 
         token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
 
@@ -210,16 +211,74 @@ def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") ->
         if not jsonl_file.exists():
             return f"❌ Dataset file not found: {jsonl_path}"
 
-        #
+        # Read and process the JSONL to collect audio files and update paths
+        audio_files = []
+        updated_rows = []
+        total_audio_size = 0
+
+        with open(jsonl_file, "r", encoding="utf-8") as f:
+            for line_num, line in enumerate(f):
+                try:
+                    row = json.loads(line.strip())
+                    audio_path = row.get("audio_path", "")
+
+                    if audio_path:
+                        audio_file = Path(audio_path)
+                        if audio_file.exists():
+                            # Store the original file for upload
+                            audio_files.append(audio_file)
+                            total_audio_size += audio_file.stat().st_size
+
+                            # Update path to be relative for the dataset
+                            row["audio_path"] = f"audio/{audio_file.name}"
+                        else:
+                            print(f"⚠️ Warning: Audio file not found: {audio_path}")
+                            row["audio_path"] = ""  # Clear missing files
+
+                    updated_rows.append(row)
+                except json.JSONDecodeError as e:
+                    print(f"⚠️ Warning: Invalid JSON on line {line_num + 1}: {e}")
+                    continue
+
+        # Create updated JSONL with relative paths
+        temp_jsonl_path = jsonl_file.parent / "temp_data.jsonl"
+        with open(temp_jsonl_path, "w", encoding="utf-8") as f:
+            for row in updated_rows:
+                f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+        # Upload the updated JSONL file
         api.upload_file(
-            path_or_fileobj=str(
+            path_or_fileobj=str(temp_jsonl_path),
             path_in_repo="data.jsonl",
             repo_id=repo_name,
             repo_type="dataset",
             token=token
         )
 
-        #
+        # Clean up temp file
+        temp_jsonl_path.unlink()
+
+        # Upload audio files
+        uploaded_count = 0
+        for audio_file in audio_files:
+            try:
+                remote_path = f"audio/{audio_file.name}"
+                api.upload_file(
+                    path_or_fileobj=str(audio_file),
+                    path_in_repo=remote_path,
+                    repo_id=repo_name,
+                    repo_type="dataset",
+                    token=token
+                )
+                uploaded_count += 1
+                print(f"✅ Uploaded audio file: {audio_file.name}")
+            except Exception as e:
+                print(f"❌ Failed to upload {audio_file.name}: {e}")
+
+        # Calculate total dataset size
+        total_dataset_size = jsonl_file.stat().st_size + total_audio_size
+
+        # Create README for the dataset
         readme_content = f"""---
 dataset_info:
   features:
@@ -230,9 +289,15 @@
   splits:
   - name: train
     num_bytes: {jsonl_file.stat().st_size}
-    num_examples: {
-  download_size: {
-  dataset_size: {
+    num_examples: {len(updated_rows)}
+  download_size: {total_dataset_size}
+  dataset_size: {total_dataset_size}
+tags:
+- voxtral
+- asr
+- speech-to-text
+- fine-tuning
+- audio-dataset
 ---
 
 # Voxtral ASR Dataset
@@ -241,15 +306,43 @@ This dataset was created using the Voxtral ASR Fine-tuning Interface.
 
 ## Dataset Structure
 
-- **audio_path**:
+- **audio_path**: Relative path to the audio file (stored in `audio/` directory)
 - **text**: Transcription of the audio
 
+## Dataset Statistics
+
+- **Number of examples**: {len(updated_rows)}
+- **Audio files uploaded**: {uploaded_count}
+- **Total dataset size**: {total_dataset_size:,} bytes
+
 ## Usage
 
 ```python
-from datasets import load_dataset
+from datasets import load_dataset, Audio
+
+# Load dataset
+dataset = load_dataset("{repo_name}")
+
+# Load audio data
+dataset = dataset.cast_column("audio_path", Audio())
+
+# Access first example
+print(dataset[0]["text"])
+print(dataset[0]["audio_path"])
+```
+
+## Loading with Audio Decoding
+
+```python
+from datasets import load_dataset, Audio
 
+# Load with automatic audio decoding
 dataset = load_dataset("{repo_name}")
+dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))
+
+# The audio column will contain the decoded audio arrays
+audio_array = dataset[0]["audio_path"]["array"]
+sampling_rate = dataset[0]["audio_path"]["sampling_rate"]
 ```
 """
 
@@ -268,7 +361,7 @@ dataset = load_dataset("{repo_name}")
 
         readme_path.unlink()  # Clean up temp file
 
-        return f"✅ Dataset pushed to: https://huggingface.co/datasets/{repo_name}"
+        return f"✅ Dataset pushed to: https://huggingface.co/datasets/{repo_name}\nUploaded {len(updated_rows)} examples and {uploaded_count} audio files"
 
     except Exception as e:
         return f"❌ Failed to push dataset: {e}"
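The heart of the interface.py change is the path rewrite: each JSONL row's local `audio_path` is replaced with a repo-relative `audio/<filename>` path before upload, and rows pointing at missing files are blanked out. Below is a minimal sketch of that rewrite in isolation; the helper name `rewrite_audio_paths` and the sample row are illustrative, not part of the commit.

```python
import json
from pathlib import Path


def rewrite_audio_paths(jsonl_path: str) -> list[dict]:
    """Rewrite local audio paths to repo-relative 'audio/<name>' paths.

    Mirrors the logic added to _push_dataset_to_hub: rows whose audio file
    cannot be found get an empty audio_path instead of a broken one.
    """
    rows = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            row = json.loads(line.strip())
            audio_file = Path(row.get("audio_path", ""))
            row["audio_path"] = f"audio/{audio_file.name}" if audio_file.is_file() else ""
            rows.append(row)
    return rows


# Hypothetical row before: {"audio_path": "/tmp/uploads/clip_0.wav", "text": "hello"}
# Row after the rewrite:   {"audio_path": "audio/clip_0.wav", "text": "hello"}
```

The pushed repo then contains `data.jsonl` plus an `audio/` folder, which is why the generated README casts the `audio_path` column with `datasets.Audio()`.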
scripts/push_to_huggingface.py
CHANGED

@@ -502,11 +502,11 @@ MIT License
         return True
 
     def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
-        """Push dataset to Hugging Face Hub"""
+        """Push dataset to Hugging Face Hub including audio files"""
         logger.info(f"Starting dataset push to {dataset_repo_name}")
 
         try:
-            from huggingface_hub import create_repo
+            from huggingface_hub import create_repo, upload_file
             import json
 
             # Determine full dataset repo name
@@ -529,15 +529,44 @@ MIT License
                 logger.error(f"❌ Dataset file not found: {dataset_path}")
                 return False
 
-            #
-
-
-
-            file_size = dataset_file.stat().st_size
+            # Read and process the JSONL to collect audio files and update paths
+            audio_files = []
+            updated_rows = []
+            total_audio_size = 0
 
-
+            with open(dataset_file, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f):
+                    try:
+                        row = json.loads(line.strip())
+                        audio_path = row.get("audio_path", "")
+
+                        if audio_path:
+                            audio_file = Path(audio_path)
+                            if audio_file.exists():
+                                # Store the original file for upload
+                                audio_files.append(audio_file)
+                                total_audio_size += audio_file.stat().st_size
+
+                                # Update path to be relative for the dataset
+                                row["audio_path"] = f"audio/{audio_file.name}"
+                            else:
+                                logger.warning(f"Audio file not found: {audio_path}")
+                                row["audio_path"] = ""  # Clear missing files
+
+                        updated_rows.append(row)
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"Invalid JSON on line {line_num + 1}: {e}")
+                        continue
+
+            # Create updated JSONL with relative paths
+            temp_jsonl_path = dataset_file.parent / "temp_data.jsonl"
+            with open(temp_jsonl_path, "w", encoding="utf-8") as f:
+                for row in updated_rows:
+                    f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+            # Upload the updated JSONL file
             upload_file(
-                path_or_fileobj=str(
+                path_or_fileobj=str(temp_jsonl_path),
                 path_in_repo="data.jsonl",
                 repo_id=dataset_repo_name,
                 repo_type="dataset",
@@ -545,7 +574,30 @@ MIT License
             )
             logger.info(f"✅ Uploaded dataset file: {dataset_file.name}")
 
-            #
+            # Clean up temp file
+            temp_jsonl_path.unlink()
+
+            # Upload audio files
+            uploaded_count = 0
+            for audio_file in audio_files:
+                try:
+                    remote_path = f"audio/{audio_file.name}"
+                    upload_file(
+                        path_or_fileobj=str(audio_file),
+                        path_in_repo=remote_path,
+                        repo_id=dataset_repo_name,
+                        repo_type="dataset",
+                        token=self.token
+                    )
+                    uploaded_count += 1
+                    logger.info(f"✅ Uploaded audio file: {audio_file.name}")
+                except Exception as e:
+                    logger.error(f"❌ Failed to upload {audio_file.name}: {e}")
+
+            # Calculate total dataset size
+            total_dataset_size = dataset_file.stat().st_size + total_audio_size
+
+            # Create a comprehensive dataset README
             readme_content = f"""---
 dataset_info:
   features:
@@ -555,18 +607,17 @@ dataset_info:
     dtype: string
   splits:
   - name: train
-    num_bytes: {
-    num_examples: {
-  download_size: {
-  dataset_size: {
+    num_bytes: {dataset_file.stat().st_size}
+    num_examples: {len(updated_rows)}
+  download_size: {total_dataset_size}
+  dataset_size: {total_dataset_size}
 tags:
 - voxtral
 - asr
-- fine-tuning
-- conversational
 - speech-to-text
--
--
+- fine-tuning
+- audio-dataset
+- tonic
 ---
 
 # Voxtral ASR Dataset
@@ -575,21 +626,53 @@ This dataset was created for fine-tuning Voxtral ASR models.
 
 ## Dataset Structure
 
-- **audio_path**:
+- **audio_path**: Relative path to the audio file (stored in `audio/` directory)
 - **text**: Transcription of the audio
 
-## Statistics
+## Dataset Statistics
 
-- Number of examples
--
+- **Number of examples**: {len(updated_rows)}
+- **Audio files uploaded**: {uploaded_count}
+- **Total dataset size**: {total_dataset_size:,} bytes
 
 ## Usage
 
 ```python
-from datasets import load_dataset
+from datasets import load_dataset, Audio
+
+# Load dataset
+dataset = load_dataset("{dataset_repo_name}")
+
+# Load audio data
+dataset = dataset.cast_column("audio_path", Audio())
+
+# Access first example
+print(dataset[0]["text"])
+print(dataset[0]["audio_path"])
+```
+
+## Loading with Audio Decoding
+
+```python
+from datasets import load_dataset, Audio
 
+# Load with automatic audio decoding
 dataset = load_dataset("{dataset_repo_name}")
+dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))
+
+# The audio column will contain the decoded audio arrays
+audio_array = dataset[0]["audio_path"]["array"]
+sampling_rate = dataset[0]["audio_path"]["sampling_rate"]
 ```
+
+## Dataset Features
+
+This dataset contains audio files with corresponding transcriptions for Voxtral ASR model fine-tuning.
+All audio files are stored in the `audio/` directory and referenced using relative paths in the dataset.
+
+## License
+
+This dataset is created for research and educational purposes.
 """
 
             # Upload README
@@ -609,13 +692,97 @@ dataset = load_dataset("{dataset_repo_name}")
 
             logger.info(f"✅ Dataset README uploaded")
             logger.info(f"Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
+            logger.info(f"Uploaded {len(updated_rows)} examples and {uploaded_count} audio files")
 
             return True
 
         except Exception as e:
             logger.error(f"❌ Failed to push dataset: {e}")
             return False
-
+
+    def test_dataset_push(self, dataset_path: str) -> bool:
+        """Test dataset validation without uploading to Hugging Face Hub"""
+        logger.info(f"🧪 Testing dataset validation for {dataset_path}")
+
+        try:
+            # Read the dataset file
+            dataset_file = Path(dataset_path)
+            if not dataset_file.exists():
+                logger.error(f"❌ Dataset file not found: {dataset_path}")
+                return False
+
+            # Read and process the JSONL to validate audio files
+            audio_files = []
+            updated_rows = []
+            total_audio_size = 0
+            missing_files = []
+            invalid_json_lines = []
+
+            with open(dataset_file, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f):
+                    try:
+                        row = json.loads(line.strip())
+                        audio_path = row.get("audio_path", "")
+
+                        if audio_path:
+                            audio_file = Path(audio_path)
+                            if audio_file.exists():
+                                # Store the file info for validation
+                                audio_files.append(audio_file)
+                                total_audio_size += audio_file.stat().st_size
+                            else:
+                                missing_files.append(str(audio_path))
+
+                        updated_rows.append(row)
+                    except json.JSONDecodeError as e:
+                        invalid_json_lines.append(f"Line {line_num + 1}: {e}")
+                        continue
+
+            # Report validation results
+            logger.info("Dataset Validation Results:")
+            logger.info(f"  - Total examples: {len(updated_rows)}")
+            logger.info(f"  - Valid audio files: {len(audio_files)}")
+            logger.info(f"  - Total audio size: {total_audio_size:,} bytes")
+            logger.info(f"  - Missing audio files: {len(missing_files)}")
+            logger.info(f"  - Invalid JSON lines: {len(invalid_json_lines)}")
+
+            if missing_files:
+                logger.warning("⚠️ Missing audio files:")
+                for missing in missing_files[:5]:  # Show first 5
+                    logger.warning(f"  - {missing}")
+                if len(missing_files) > 5:
+                    logger.warning(f"  ... and {len(missing_files) - 5} more")
+
+            if invalid_json_lines:
+                logger.warning("⚠️ Invalid JSON lines:")
+                for invalid in invalid_json_lines[:3]:  # Show first 3
+                    logger.warning(f"  - {invalid}")
+                if len(invalid_json_lines) > 3:
+                    logger.warning(f"  ... and {len(invalid_json_lines) - 3} more")
+
+            # Show sample of how paths will be converted
+            if audio_files:
+                logger.info("Path conversion preview:")
+                for audio_file in audio_files[:3]:  # Show first 3
+                    logger.info(f"  - {str(audio_file)} → audio/{audio_file.name}")
+
+            # Overall validation status
+            if len(updated_rows) == 0:
+                logger.error("❌ No valid examples found in dataset")
+                return False
+
+            if len(missing_files) > 0:
+                logger.warning("⚠️ Some audio files are missing - they will be skipped during upload")
+            else:
+                logger.info("✅ All audio files found and valid")
+
+            logger.info("✅ Dataset validation completed successfully!")
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to validate dataset: {e}")
+            return False
+
     def _load_training_config(self) -> Dict[str, Any]:
         """Load training configuration"""
         config_path = self.model_path / "training_config.json"
@@ -656,6 +823,7 @@ def parse_args():
     dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
     dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
     dataset_parser.add_argument('--private', action='store_true', help='Make repository private')
+    dataset_parser.add_argument('--test', action='store_true', help='Test mode - validate dataset without uploading')
 
     return parser.parse_args()
 
@@ -710,15 +878,24 @@ def main():
                 private=args.private
             )
 
-
-
-
-
-
-
+        if getattr(args, 'test', False):
+            # Test mode - validate dataset without uploading
+            success = pusher.test_dataset_push(args.dataset_path)
+            if success:
+                logger.info("✅ Dataset validation completed successfully!")
+            else:
+                logger.error("❌ Dataset validation failed!")
+                return 1
         else:
-
-
+            # Push dataset
+            success = pusher.push_dataset(args.dataset_path, args.repo_name)
+
+            if success:
+                logger.info("✅ Dataset push completed successfully!")
+                logger.info(f"View your dataset at: https://huggingface.co/datasets/{args.repo_name}")
+            else:
+                logger.error("❌ Dataset push failed!")
+                return 1
 
     except Exception as e:
         logger.error(f"❌ Error during push: {e}")
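The new `--test` flag turns the dataset push into a dry run that only validates the JSONL and its audio files. A hedged invocation sketch follows; the `dataset` sub-command name, the argument order, and the example paths are assumptions inferred from the parser variables visible in this diff (`dataset_parser`, `args.dataset_path`, `args.repo_name`), not verbatim repository usage.

```python
import subprocess

# Assumed CLI shape: a "dataset" sub-command with a dataset_path positional,
# a repo_name positional, and the new --test flag (validate only, no upload).
subprocess.run(
    [
        "python", "scripts/push_to_huggingface.py", "dataset",
        "datasets/my_run/data.jsonl",      # hypothetical local JSONL path
        "username/voxtral-asr-dataset",    # hypothetical dataset repo name
        "--test",
    ],
    check=True,
)

# Dropping --test would run push_dataset() instead, uploading data.jsonl
# plus every referenced audio file under audio/ to the dataset repo.
```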