```python
| import torch | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| from datasets import Dataset | |
| device = "cuda" | |
| path = "Unbabel/mfineweb-edu-classifier" | |
| model = AutoModelForSequenceClassification.from_pretrained( | |
| path, | |
| device_map=device, | |
| trust_remote_code=True, | |
| torch_dtype=torch.bfloat16 | |
| ) | |
| tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True) | |
| def tokenize(examples): | |
| return tokenizer(examples["text"], truncation=True, max_length=512) | |
| texts = [ | |
| "This is a text", | |
| "this is another text to classify" | |
| ] | |
| model_inputs = [ | |
| tokenizer(text, truncation=True, max_length=512) for text in texts | |
| ] | |
| with torch.no_grad(): | |
| for model_input in model_inputs: | |
| output = model(input_ids) | |
```