Spaces:
Running
Running
Fix data collation issue: pass pad_to_multiple_of=8 and return_tensors="pt" to DataCollatorForLanguageModeling
Browse files
data.py
CHANGED
|
@@ -261,6 +261,8 @@ class SmolLM3Dataset:
|
|
| 261 |
return DataCollatorForLanguageModeling(
|
| 262 |
tokenizer=self.tokenizer,
|
| 263 |
mlm=False, # We're doing causal LM, not masked LM
|
|
|
|
|
|
|
| 264 |
)
|
| 265 |
|
| 266 |
def create_sample_dataset(output_path: str = "my_dataset"):
|
|
|
|
| 261 |
return DataCollatorForLanguageModeling(
|
| 262 |
tokenizer=self.tokenizer,
|
| 263 |
mlm=False, # We're doing causal LM, not masked LM
|
| 264 |
+
pad_to_multiple_of=8, # Pad to multiple of 8 for efficiency
|
| 265 |
+
return_tensors="pt", # Ensure we return PyTorch tensors
|
| 266 |
)
|
| 267 |
|
| 268 |
def create_sample_dataset(output_path: str = "my_dataset"):
|