Joseph Pollack committed on
Commit
9f8cd0c
·
unverified ·
1 Parent(s): b82e5c5

adds model card flow

Browse files
scripts/push_to_huggingface.py CHANGED
@@ -294,7 +294,11 @@ class HuggingFacePusher:
294
 
295
  # Create variables for the model card
296
  variables = create_default_variables()
297
-
 
 
 
 
298
  # Update with actual values
299
  variables.update({
300
  "repo_name": self.repo_id,
@@ -305,7 +309,10 @@ class HuggingFacePusher:
305
  "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
306
  "training_config_type": self.training_config_type or "Custom Configuration",
307
  "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
308
- "dataset_name": self.dataset_name or "Custom Dataset",
 
 
 
309
  "trainer_type": self.trainer_type or "SFTTrainer",
310
  "batch_size": str(self.batch_size) if self.batch_size else "8",
311
  "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"),
@@ -576,7 +583,7 @@ MIT License
576
  # Create and upload model card
577
  model_card = self.create_model_card(training_config, results)
578
  model_card_path = Path("temp_model_card.md")
579
- with open(model_card_path, "w") as f:
580
  f.write(model_card)
581
 
582
  try:
@@ -779,7 +786,7 @@ This dataset is created for research and educational purposes.
779
 
780
  # Upload README
781
  readme_path = dataset_file.parent / "README.md"
782
- with open(readme_path, "w") as f:
783
  f.write(readme_content)
784
 
785
  upload_file(
 
294
 
295
  # Create variables for the model card
296
  variables = create_default_variables()
297
+
298
+ # Determine whether dataset_name looks like a valid Hub dataset id (owner/dataset)
299
+ hub_dataset = (self.dataset_name or "").strip()
300
+ has_hub_dataset_id = bool(hub_dataset and "/" in hub_dataset and " " not in hub_dataset and len(hub_dataset.split("/")) == 2)
301
+
302
  # Update with actual values
303
  variables.update({
304
  "repo_name": self.repo_id,
 
309
  "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
310
  "training_config_type": self.training_config_type or "Custom Configuration",
311
  "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
312
+ "dataset_name": hub_dataset if hub_dataset else "",
313
+ "has_hub_dataset_id": has_hub_dataset_id,
314
+ # Only include model-index when a dataset name is provided
315
+ "include_model_index": bool(hub_dataset),
316
  "trainer_type": self.trainer_type or "SFTTrainer",
317
  "batch_size": str(self.batch_size) if self.batch_size else "8",
318
  "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"),
 
583
  # Create and upload model card
584
  model_card = self.create_model_card(training_config, results)
585
  model_card_path = Path("temp_model_card.md")
586
+ with open(model_card_path, "w", encoding="utf-8") as f:
587
  f.write(model_card)
588
 
589
  try:
 
786
 
787
  # Upload README
788
  readme_path = dataset_file.parent / "README.md"
789
+ with open(readme_path, "w", encoding="utf-8") as f:
790
  f.write(readme_content)
791
 
792
  upload_file(
templates/model_card.md CHANGED
@@ -1,103 +1,19 @@
1
  ---
2
- language:
3
- - en
4
- - fr
5
  license: apache-2.0
6
- library_name: transformers
7
  tags:
8
  - voxtral
9
- - fine-tuned
10
- - text-generation
11
- - tonic
12
- {{#if quantized_models}}- quantized{{/if}}
13
- pipeline_tag: text-generation
14
  base_model: {{base_model}}
15
- {{#if dataset_name}}
16
  datasets:
17
  - {{dataset_name}}
18
  {{/if}}
19
- {{#if quantized_models}}
20
- model-index:
21
- - name: {{model_name}}
22
- results:
23
- - task:
24
- type: text-generation
25
- dataset:
26
- name: {{dataset_name}}
27
- type: {{dataset_name}}
28
- metrics:
29
- - name: Training Loss
30
- type: loss
31
- value: "{{training_loss|default:'N/A'}}"
32
- - name: Validation Loss
33
- type: loss
34
- value: "{{validation_loss|default:'N/A'}}"
35
- - name: Perplexity
36
- type: perplexity
37
- value: "{{perplexity|default:'N/A'}}"
38
- - name: {{model_name}} (int8 quantized)
39
- results:
40
- - task:
41
- type: text-generation
42
- dataset:
43
- name: {{dataset_name}}
44
- type: {{dataset_name}}
45
- metrics:
46
- - name: Memory Reduction
47
- type: memory_efficiency
48
- value: "~50%"
49
- - name: Inference Speed
50
- type: speed
51
- value: "Faster"
52
- - name: {{model_name}} (int4 quantized)
53
- results:
54
- - task:
55
- type: text-generation
56
- dataset:
57
- name: {{dataset_name}}
58
- type: {{dataset_name}}
59
- metrics:
60
- - name: Memory Reduction
61
- type: memory_efficiency
62
- value: "~75%"
63
- - name: Inference Speed
64
- type: speed
65
- value: "Significantly Faster"
66
- {{else}}
67
- model-index:
68
- - name: {{model_name}}
69
- results:
70
- - task:
71
- type: text-generation
72
- dataset:
73
- name: {{dataset_name}}
74
- type: {{dataset_name}}
75
- metrics:
76
- - name: Training Loss
77
- type: loss
78
- value: "{{training_loss|default:'N/A'}}"
79
- - name: Validation Loss
80
- type: loss
81
- value: "{{validation_loss|default:'N/A'}}"
82
- - name: Perplexity
83
- type: perplexity
84
- value: "{{perplexity|default:'N/A'}}"
85
- {{/if}}
86
  {{#if author_name}}
87
  author: {{author_name}}
88
  {{/if}}
89
- {{#if experiment_name}}
90
- experiment_name: {{experiment_name}}
91
- {{/if}}
92
- {{#if trackio_url}}
93
- trackio_url: {{trackio_url}}
94
- {{/if}}
95
- {{#if dataset_repo}}
96
- dataset_repo: {{dataset_repo}}
97
- {{/if}}
98
- {{#if hardware_info}}
99
- hardware: "{{hardware_info}}"
100
- {{/if}}
101
  {{#if training_config_type}}
102
  training_config: {{training_config_type}}
103
  {{/if}}
@@ -107,6 +23,9 @@ trainer_type: {{trainer_type}}
107
  {{#if batch_size}}
108
  batch_size: {{batch_size}}
109
  {{/if}}
 
 
 
110
  {{#if learning_rate}}
111
  learning_rate: {{learning_rate}}
112
  {{/if}}
@@ -116,17 +35,8 @@ max_epochs: {{max_epochs}}
116
  {{#if max_seq_length}}
117
  max_seq_length: {{max_seq_length}}
118
  {{/if}}
119
- {{#if dataset_sample_size}}
120
- dataset_sample_size: {{dataset_sample_size}}
121
- {{/if}}
122
- {{#if dataset_size}}
123
- dataset_size: {{dataset_size}}
124
- {{/if}}
125
- {{#if dataset_format}}
126
- dataset_format: {{dataset_format}}
127
- {{/if}}
128
- {{#if gradient_accumulation_steps}}
129
- gradient_accumulation_steps: {{gradient_accumulation_steps}}
130
  {{/if}}
131
  ---
132
 
@@ -134,210 +44,45 @@ gradient_accumulation_steps: {{gradient_accumulation_steps}}
134
 
135
  {{model_description}}
136
 
137
- ## Model Details
138
-
139
- - **Base Model**: SmolLM3-3B
140
- - **Model Type**: Causal Language Model
141
- - **Languages**: English, French
142
- - **License**: Apache 2.0
143
- - **Fine-tuned**: Yes
144
- {{#if quantized_models}}
145
- - **Quantized Versions**: Available in subdirectories
146
- {{/if}}
147
-
148
  ## Usage
149
 
150
- ### Main Model
151
-
152
  ```python
153
  import torch
154
- from transformers import AutoModelForCausalLM, AutoTokenizer
 
155
 
156
- # Load the main model
157
- model = AutoModelForCausalLM.from_pretrained(
158
  "{{repo_name}}",
159
- device_map="auto",
160
- torch_dtype=torch.bfloat16
161
  )
162
- tokenizer = AutoTokenizer.from_pretrained("{{repo_name}}")
163
-
164
- # Generate text
165
- input_text = "What are we having for dinner?"
166
- input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type)
167
- output = model.generate(**input_ids, max_new_tokens=50)
168
- print(tokenizer.decode(output[0], skip_special_tokens=True))
169
- ```
170
-
171
- ## Training Information
172
-
173
- ### Training Configuration
174
- - **Base Model**: {{base_model}}
175
- - **Dataset**: {{dataset_name}}
176
- - **Training Config**: {{training_config_type}}
177
- - **Trainer Type**: {{trainer_type}}
178
- {{#if dataset_sample_size}}
179
- - **Dataset Sample Size**: {{dataset_sample_size}}
180
- {{/if}}
181
-
182
- ### Training Parameters
183
- - **Batch Size**: {{batch_size}}
184
- - **Gradient Accumulation**: {{gradient_accumulation_steps}}
185
- - **Learning Rate**: {{learning_rate}}
186
- - **Max Epochs**: {{max_epochs}}
187
- - **Sequence Length**: {{max_seq_length}}
188
-
189
- ### Training Infrastructure
190
- - **Hardware**: {{hardware_info}}
191
- - **Monitoring**: Trackio integration
192
- - **Experiment**: {{experiment_name}}
193
-
194
- ## Model Architecture
195
-
196
- This is a fine-tuned version of the SmolLM3-3B model with the following specifications:
197
-
198
- - **Base Model**: SmolLM3-3B
199
- - **Parameters**: ~3B
200
- - **Context Length**: {{max_seq_length}}
201
- - **Languages**: English, French
202
- - **Architecture**: Transformer-based causal language model
203
-
204
- ## Performance
205
-
206
- The model provides:
207
- - **Text Generation**: High-quality text generation capabilities
208
- - **Conversation**: Natural conversation abilities
209
- - **Multilingual**: Support for English and French
210
- {{#if quantized_models}}
211
- - **Quantized Versions**: Optimized for different deployment scenarios
212
- {{/if}}
213
 
214
- ## Limitations
215
-
216
- 1. **Context Length**: Limited by the model's maximum sequence length
217
- 2. **Bias**: May inherit biases from the training data
218
- 3. **Factual Accuracy**: May generate incorrect or outdated information
219
- 4. **Safety**: Should be used responsibly with appropriate safeguards
220
- {{#if quantized_models}}
221
- 5. **Quantization**: Quantized versions may have slightly reduced accuracy
222
- {{/if}}
223
-
224
- ## Training Data
225
-
226
- The model was fine-tuned on:
227
- - **Dataset**: {{dataset_name}}
228
- - **Size**: {{dataset_size}}
229
- - **Format**: {{dataset_format}}
230
- - **Languages**: English, French
231
-
232
- ## Evaluation
233
-
234
- The model was evaluated using:
235
- - **Metrics**: Loss, perplexity, and qualitative assessment
236
- - **Monitoring**: Real-time tracking via Trackio
237
- - **Validation**: Regular validation during training
238
-
239
- ## Citation
240
-
241
- If you use this model in your research, please cite:
242
-
243
- ```bibtex
244
- @misc{{{model_name_slug}},
245
- title={{{{model_name}}}},
246
- author={{{author_name}}},
247
- year={2024},
248
- url={https://huggingface.co/{{repo_name}}}
249
- }
250
- ```
251
-
252
- ## License
253
-
254
- This model is licensed under the Apache 2.0 License.
255
-
256
- ## Acknowledgments
257
-
258
- - **Base Model**: SmolLM3-3B by HuggingFaceTB
259
- - **Training Framework**: PyTorch, Transformers, PEFT
260
- - **Monitoring**: Trackio integration
261
- - **Quantization**: torchao library
262
-
263
- ## Support
264
-
265
- For questions and support:
266
- - Open an issue on the Hugging Face repository
267
- - Check the model documentation
268
- - Review the training logs and configuration
269
-
270
- ## Repository Structure
271
-
272
- ```
273
- {{repo_name}}/
274
- ├── README.md (this file)
275
- ├── config.json
276
- ├── pytorch_model.bin
277
- ├── tokenizer.json
278
- └── tokenizer_config.json
279
  ```
280
 
281
- ## Usage Examples
282
 
283
- ### Text Generation
284
- ```python
285
- from transformers import AutoModelForCausalLM, AutoTokenizer
286
 
287
- model = AutoModelForCausalLM.from_pretrained("{{repo_name}}")
288
- tokenizer = AutoTokenizer.from_pretrained("{{repo_name}}")
289
 
290
- text = "The future of artificial intelligence is"
291
- inputs = tokenizer(text, return_tensors="pt")
292
- outputs = model.generate(**inputs, max_new_tokens=100)
293
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
294
- ```
295
 
296
- ### Conversation
297
- ```python
298
- def chat_with_model(prompt, max_length=100):
299
- inputs = tokenizer(prompt, return_tensors="pt")
300
- outputs = model.generate(**inputs, max_new_tokens=max_length)
301
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
302
-
303
- response = chat_with_model("Hello, how are you today?")
304
- print(response)
305
- ```
306
-
307
- ### Advanced Usage
308
- ```python
309
- # With generation parameters
310
- outputs = model.generate(
311
- **inputs,
312
- max_new_tokens=100,
313
- temperature=0.7,
314
- top_p=0.9,
315
- do_sample=True,
316
- pad_token_id=tokenizer.eos_token_id
317
- )
318
- ```
319
-
320
- ## Monitoring and Tracking
321
-
322
- This model was trained with comprehensive monitoring:
323
- - **Trackio Space**: {{trackio_url}}
324
- - **Experiment**: {{experiment_name}}
325
- - **Dataset Repository**: https://huggingface.co/datasets/{{dataset_repo}}
326
- - **Training Logs**: Available in the experiment data
327
-
328
- ## Deployment
329
-
330
- ### Requirements
331
- ```bash
332
- pip install torch transformers accelerate
333
- {{#if quantized_models}}
334
- pip install torchao # For quantized models
335
- {{/if}}
336
- ```
337
 
338
- ### Hardware Requirements
339
- - **Main Model**: GPU with 8GB+ VRAM recommended
340
 
341
- ## Changelog
342
 
343
- - **v1.0.0**: Initial release with fine-tuned model
 
1
  ---
 
 
 
2
  license: apache-2.0
 
3
  tags:
4
  - voxtral
5
+ - asr
6
+ - speech-to-text
7
+ - fine-tuning
8
+ pipeline_tag: automatic-speech-recognition
 
9
  base_model: {{base_model}}
10
+ {{#if has_hub_dataset_id}}
11
  datasets:
12
  - {{dataset_name}}
13
  {{/if}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  {{#if author_name}}
15
  author: {{author_name}}
16
  {{/if}}
 
 
 
 
 
 
 
 
 
 
 
 
17
  {{#if training_config_type}}
18
  training_config: {{training_config_type}}
19
  {{/if}}
 
23
  {{#if batch_size}}
24
  batch_size: {{batch_size}}
25
  {{/if}}
26
+ {{#if gradient_accumulation_steps}}
27
+ gradient_accumulation_steps: {{gradient_accumulation_steps}}
28
+ {{/if}}
29
  {{#if learning_rate}}
30
  learning_rate: {{learning_rate}}
31
  {{/if}}
 
35
  {{#if max_seq_length}}
36
  max_seq_length: {{max_seq_length}}
37
  {{/if}}
38
+ {{#if hardware_info}}
39
+ hardware: "{{hardware_info}}"
 
 
 
 
 
 
 
 
 
40
  {{/if}}
41
  ---
42
 
 
44
 
45
  {{model_description}}
46
 
 
 
 
 
 
 
 
 
 
 
 
47
  ## Usage
48
 
 
 
49
  ```python
50
  import torch
51
+ from transformers import AutoProcessor, AutoModelForSeq2SeqLM
52
+ import soundfile as sf
53
 
54
+ processor = AutoProcessor.from_pretrained("{{repo_name}}")
55
+ model = AutoModelForSeq2SeqLM.from_pretrained(
56
  "{{repo_name}}",
57
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
 
58
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
+ audio, sr = sf.read("sample.wav")
61
+ inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
62
+ with torch.no_grad():
63
+ generated_ids = model.generate(**inputs, max_new_tokens=256)
64
+ text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
65
+ print(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  ```
67
 
68
+ ## Training Configuration
69
 
70
+ - Base model: {{base_model}}
71
+ {{#if training_config_type}}- Config: {{training_config_type}}{{/if}}
72
+ {{#if trainer_type}}- Trainer: {{trainer_type}}{{/if}}
73
 
74
+ ## Training Parameters
 
75
 
76
+ - Batch size: {{batch_size}}
77
+ - Gradient accumulation: {{gradient_accumulation_steps}}
78
+ - Learning rate: {{learning_rate}}
79
+ - Max epochs: {{max_epochs}}
80
+ - Sequence length: {{max_seq_length}}
81
 
82
+ ## Hardware
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
+ - {{hardware_info}}
 
85
 
86
+ ## Notes
87
 
88
+ - This repository contains a fine-tuned Voxtral ASR model.