import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

model_name = "DeepPavlov/rubert-base-cased"

# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'data/train-00000-of-00001.parquet',
          'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/mteb/RuSciBenchOECDClassification/" + splits["train"])

# Convert the dataframe into Dataset objects with an 80/20 train/test split
train, test = train_test_split(df, test_size=0.2)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

# Preprocess the text: tokenize with the model's own tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)

# Load the pretrained model with a classification head for the 28 OECD classes
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=28)

# Set the training parameters; evaluation_strategy='epoch' replaces the
# removed evaluate_during_training flag (newer transformers versions call
# this argument eval_strategy)
training_args = TrainingArguments(
    output_dir='test_trainer_log',
    evaluation_strategy='epoch',
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    report_to='none'
)

# Define how the metric is computed: F1 needs an explicit averaging mode
# for a multiclass problem (the default 'binary' would raise an error)
metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels,
                          average='macro')

# Run the training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics)
trainer.train()

# Save the model
save_directory = './pt_save_pretrained'
#tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

# alternatively save the trainer
#trainer.save_model('CustomModels/CustomHamSpam')
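
# A minimal inference sketch (an illustrative assumption, not part of the
# original pipeline): reload the weights saved by model.save_pretrained()
# and classify one text. The tokenizer is re-loaded from model_name because
# only the model, not the tokenizer, was saved above; the sample text is
# hypothetical.
import torch

loaded_model = AutoModelForSequenceClassification.from_pretrained('./pt_save_pretrained')
loaded_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

inputs = loaded_tokenizer("Пример аннотации научной статьи",
                          return_tensors='pt', truncation=True)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
predicted_class = int(logits.argmax(dim=-1))  # index into the 28 OECD classes
print(predicted_class)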