Spaces:

Tonic
/

VoxFactory

Running

App Files Community

Joseph Pollack committed on Sep 13

Commit

9f8cd0c

unverified ·

1 Parent(s): b82e5c5

adds model card flow

Browse files

Files changed (2) hide show

scripts/push_to_huggingface.py +11 -4
templates/model_card.md +35 -290

scripts/push_to_huggingface.py CHANGED Viewed

@@ -294,7 +294,11 @@ class HuggingFacePusher:
             # Create variables for the model card
             variables = create_default_variables()
             # Update with actual values
             variables.update({
                 "repo_name": self.repo_id,
@@ -305,7 +309,10 @@ class HuggingFacePusher:
                 "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
                 "training_config_type": self.training_config_type or "Custom Configuration",
                 "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
-                "dataset_name": self.dataset_name or "Custom Dataset",
                 "trainer_type": self.trainer_type or "SFTTrainer",
                 "batch_size": str(self.batch_size) if self.batch_size else "8",
                 "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"),
@@ -576,7 +583,7 @@ MIT License
         # Create and upload model card
         model_card = self.create_model_card(training_config, results)
         model_card_path = Path("temp_model_card.md")
-        with open(model_card_path, "w") as f:
             f.write(model_card)
         try:
@@ -779,7 +786,7 @@ This dataset is created for research and educational purposes.
             # Upload README
             readme_path = dataset_file.parent / "README.md"
-            with open(readme_path, "w") as f:
                 f.write(readme_content)
             upload_file(

             # Create variables for the model card
             variables = create_default_variables()
+            # Determine whether dataset_name looks like a valid Hub dataset id (owner/dataset)
+            hub_dataset = (self.dataset_name or "").strip()
+            has_hub_dataset_id = bool(hub_dataset and "/" in hub_dataset and " " not in hub_dataset and len(hub_dataset.split("/")) == 2)
             # Update with actual values
             variables.update({
                 "repo_name": self.repo_id,
                 "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
                 "training_config_type": self.training_config_type or "Custom Configuration",
                 "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
+                "dataset_name": hub_dataset if hub_dataset else "",
+                "has_hub_dataset_id": has_hub_dataset_id,
+                # Only include model-index when a dataset is provided or when metrics are meaningful
+                "include_model_index": bool(hub_dataset),
                 "trainer_type": self.trainer_type or "SFTTrainer",
                 "batch_size": str(self.batch_size) if self.batch_size else "8",
                 "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"),
         # Create and upload model card
         model_card = self.create_model_card(training_config, results)
         model_card_path = Path("temp_model_card.md")
+        with open(model_card_path, "w", encoding="utf-8") as f:
             f.write(model_card)
         try:
             # Upload README
             readme_path = dataset_file.parent / "README.md"
+            with open(readme_path, "w", encoding="utf-8") as f:
                 f.write(readme_content)
             upload_file(

templates/model_card.md CHANGED Viewed

@@ -1,103 +1,19 @@
 ---
-language:
-- en
-- fr
 license: apache-2.0
-library_name: transformers
 tags:
 - voxtral
-- fine-tuned
-- text-generation
-- tonic
-{{#if quantized_models}}- quantized{{/if}}
-pipeline_tag: text-generation
 base_model: {{base_model}}
-{{#if dataset_name}}
 datasets:
 - {{dataset_name}}
 {{/if}}
-{{#if quantized_models}}
-model-index:
-- name: {{model_name}}
-  results:
-  - task:
-      type: text-generation
-    dataset:
-      name: {{dataset_name}}
-      type: {{dataset_name}}
-    metrics:
-    - name: Training Loss
-      type: loss
-      value: "{{training_loss|default:'N/A'}}"
-    - name: Validation Loss
-      type: loss
-      value: "{{validation_loss|default:'N/A'}}"
-    - name: Perplexity
-      type: perplexity
-      value: "{{perplexity|default:'N/A'}}"
-- name: {{model_name}} (int8 quantized)
-  results:
-  - task:
-      type: text-generation
-    dataset:
-      name: {{dataset_name}}
-      type: {{dataset_name}}
-    metrics:
-    - name: Memory Reduction
-      type: memory_efficiency
-      value: "~50%"
-    - name: Inference Speed
-      type: speed
-      value: "Faster"
-- name: {{model_name}} (int4 quantized)
-  results:
-  - task:
-      type: text-generation
-    dataset:
-      name: {{dataset_name}}
-      type: {{dataset_name}}
-    metrics:
-    - name: Memory Reduction
-      type: memory_efficiency
-      value: "~75%"
-    - name: Inference Speed
-      type: speed
-      value: "Significantly Faster"
-{{else}}
-model-index:
-- name: {{model_name}}
-  results:
-  - task:
-      type: text-generation
-    dataset:
-      name: {{dataset_name}}
-      type: {{dataset_name}}
-    metrics:
-    - name: Training Loss
-      type: loss
-      value: "{{training_loss|default:'N/A'}}"
-    - name: Validation Loss
-      type: loss
-      value: "{{validation_loss|default:'N/A'}}"
-    - name: Perplexity
-      type: perplexity
-      value: "{{perplexity|default:'N/A'}}"
-{{/if}}
 {{#if author_name}}
 author: {{author_name}}
 {{/if}}
-{{#if experiment_name}}
-experiment_name: {{experiment_name}}
-{{/if}}
-{{#if trackio_url}}
-trackio_url: {{trackio_url}}
-{{/if}}
-{{#if dataset_repo}}
-dataset_repo: {{dataset_repo}}
-{{/if}}
-{{#if hardware_info}}
-hardware: "{{hardware_info}}"
-{{/if}}
 {{#if training_config_type}}
 training_config: {{training_config_type}}
 {{/if}}
@@ -107,6 +23,9 @@ trainer_type: {{trainer_type}}
 {{#if batch_size}}
 batch_size: {{batch_size}}
 {{/if}}
 {{#if learning_rate}}
 learning_rate: {{learning_rate}}
 {{/if}}
@@ -116,17 +35,8 @@ max_epochs: {{max_epochs}}
 {{#if max_seq_length}}
 max_seq_length: {{max_seq_length}}
 {{/if}}
-{{#if dataset_sample_size}}
-dataset_sample_size: {{dataset_sample_size}}
-{{/if}}
-{{#if dataset_size}}
-dataset_size: {{dataset_size}}
-{{/if}}
-{{#if dataset_format}}
-dataset_format: {{dataset_format}}
-{{/if}}
-{{#if gradient_accumulation_steps}}
-gradient_accumulation_steps: {{gradient_accumulation_steps}}
 {{/if}}
 ---
@@ -134,210 +44,45 @@ gradient_accumulation_steps: {{gradient_accumulation_steps}}
 {{model_description}}
-## Model Details
-- **Base Model**: SmolLM3-3B
-- **Model Type**: Causal Language Model
-- **Languages**: English, French
-- **License**: Apache 2.0
-- **Fine-tuned**: Yes
-{{#if quantized_models}}
-- **Quantized Versions**: Available in subdirectories
-{{/if}}
 ## Usage
-### Main Model
 ```python
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-# Load the main model
-model = AutoModelForCausalLM.from_pretrained(
     "{{repo_name}}",
-    device_map="auto",
-    torch_dtype=torch.bfloat16
 )
-tokenizer = AutoTokenizer.from_pretrained("{{repo_name}}")
-# Generate text
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type)
-output = model.generate(**input_ids, max_new_tokens=50)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-## Training Information
-### Training Configuration
-- **Base Model**: {{base_model}}
-- **Dataset**: {{dataset_name}}
-- **Training Config**: {{training_config_type}}
-- **Trainer Type**: {{trainer_type}}
-{{#if dataset_sample_size}}
-- **Dataset Sample Size**: {{dataset_sample_size}}
-{{/if}}
-### Training Parameters
-- **Batch Size**: {{batch_size}}
-- **Gradient Accumulation**: {{gradient_accumulation_steps}}
-- **Learning Rate**: {{learning_rate}}
-- **Max Epochs**: {{max_epochs}}
-- **Sequence Length**: {{max_seq_length}}
-### Training Infrastructure
-- **Hardware**: {{hardware_info}}
-- **Monitoring**: Trackio integration
-- **Experiment**: {{experiment_name}}
-## Model Architecture
-This is a fine-tuned version of the SmolLM3-3B model with the following specifications:
-- **Base Model**: SmolLM3-3B
-- **Parameters**: ~3B
-- **Context Length**: {{max_seq_length}}
-- **Languages**: English, French
-- **Architecture**: Transformer-based causal language model
-## Performance
-The model provides:
-- **Text Generation**: High-quality text generation capabilities
-- **Conversation**: Natural conversation abilities
-- **Multilingual**: Support for English and French
-{{#if quantized_models}}
-- **Quantized Versions**: Optimized for different deployment scenarios
-{{/if}}
-## Limitations
-1. **Context Length**: Limited by the model's maximum sequence length
-2. **Bias**: May inherit biases from the training data
-3. **Factual Accuracy**: May generate incorrect or outdated information
-4. **Safety**: Should be used responsibly with appropriate safeguards
-{{#if quantized_models}}
-5. **Quantization**: Quantized versions may have slightly reduced accuracy
-{{/if}}
-## Training Data
-The model was fine-tuned on:
-- **Dataset**: {{dataset_name}}
-- **Size**: {{dataset_size}}
-- **Format**: {{dataset_format}}
-- **Languages**: English, French
-## Evaluation
-The model was evaluated using:
-- **Metrics**: Loss, perplexity, and qualitative assessment
-- **Monitoring**: Real-time tracking via Trackio
-- **Validation**: Regular validation during training
-## Citation
-If you use this model in your research, please cite:
-```bibtex
-@misc{{{model_name_slug}},
-  title={{{{model_name}}}},
-  author={{{author_name}}},
-  year={2024},
-  url={https://huggingface.co/{{repo_name}}}
-}
-```
-## License
-This model is licensed under the Apache 2.0 License.
-## Acknowledgments
-- **Base Model**: SmolLM3-3B by HuggingFaceTB
-- **Training Framework**: PyTorch, Transformers, PEFT
-- **Monitoring**: Trackio integration
-- **Quantization**: torchao library
-## Support
-For questions and support:
-- Open an issue on the Hugging Face repository
-- Check the model documentation
-- Review the training logs and configuration
-## Repository Structure
-```
-{{repo_name}}/
-├── README.md (this file)
-├── config.json
-├── pytorch_model.bin
-├── tokenizer.json
-└── tokenizer_config.json
 ```
-## Usage Examples
-### Text Generation
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-model = AutoModelForCausalLM.from_pretrained("{{repo_name}}")
-tokenizer = AutoTokenizer.from_pretrained("{{repo_name}}")
-text = "The future of artificial intelligence is"
-inputs = tokenizer(text, return_tensors="pt")
-outputs = model.generate(**inputs, max_new_tokens=100)
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-```
-### Conversation
-```python
-def chat_with_model(prompt, max_length=100):
-    inputs = tokenizer(prompt, return_tensors="pt")
-    outputs = model.generate(**inputs, max_new_tokens=max_length)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-response = chat_with_model("Hello, how are you today?")
-print(response)
-```
-### Advanced Usage
-```python
-# With generation parameters
-outputs = model.generate(
-    **inputs,
-    max_new_tokens=100,
-    temperature=0.7,
-    top_p=0.9,
-    do_sample=True,
-    pad_token_id=tokenizer.eos_token_id
-)
-```
-## Monitoring and Tracking
-This model was trained with comprehensive monitoring:
-- **Trackio Space**: {{trackio_url}}
-- **Experiment**: {{experiment_name}}
-- **Dataset Repository**: https://huggingface.co/datasets/{{dataset_repo}}
-- **Training Logs**: Available in the experiment data
-## Deployment
-### Requirements
-```bash
-pip install torch transformers accelerate
-{{#if quantized_models}}
-pip install torchao  # For quantized models
-{{/if}}
-```
-### Hardware Requirements
-- **Main Model**: GPU with 8GB+ VRAM recommended
-## Changelog
-- **v1.0.0**: Initial release with fine-tuned model

 ---
 license: apache-2.0
 tags:
 - voxtral
+- asr
+- speech-to-text
+- fine-tuning
+pipeline_tag: automatic-speech-recognition
 base_model: {{base_model}}
+{{#if has_hub_dataset_id}}
 datasets:
 - {{dataset_name}}
 {{/if}}
 {{#if author_name}}
 author: {{author_name}}
 {{/if}}
 {{#if training_config_type}}
 training_config: {{training_config_type}}
 {{/if}}
 {{#if batch_size}}
 batch_size: {{batch_size}}
 {{/if}}
+{{#if gradient_accumulation_steps}}
+gradient_accumulation_steps: {{gradient_accumulation_steps}}
+{{/if}}
 {{#if learning_rate}}
 learning_rate: {{learning_rate}}
 {{/if}}
 {{#if max_seq_length}}
 max_seq_length: {{max_seq_length}}
 {{/if}}
+{{#if hardware_info}}
+hardware: "{{hardware_info}}"
 {{/if}}
 ---
 {{model_description}}
 ## Usage
 ```python
 import torch
+from transformers import AutoProcessor, AutoModelForSeq2SeqLM
+import soundfile as sf
+processor = AutoProcessor.from_pretrained("{{repo_name}}")
+model = AutoModelForSeq2SeqLM.from_pretrained(
     "{{repo_name}}",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
 )
+audio, sr = sf.read("sample.wav")
+inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
+with torch.no_grad():
+    generated_ids = model.generate(**inputs, max_new_tokens=256)
+text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(text)
 ```
+## Training Configuration
+- Base model: {{base_model}}
+{{#if training_config_type}}- Config: {{training_config_type}}{{/if}}
+{{#if trainer_type}}- Trainer: {{trainer_type}}{{/if}}
+## Training Parameters
+- Batch size: {{batch_size}}
+- Grad accumulation: {{gradient_accumulation_steps}}
+- Learning rate: {{learning_rate}}
+- Max epochs: {{max_epochs}}
+- Sequence length: {{max_seq_length}}
+## Hardware
+- {{hardware_info}}
+## Notes
+- This repository contains a fine-tuned Voxtral ASR model.