# Page header captured together with this script ("Spaces: Runtime error");
# it is not part of the code.
import datasets
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# Pretrained checkpoint that is fine-tuned below.
model_name = "DeepPavlov/rubert-base-cased"

# Login using e.g. `huggingface-cli login` to access this dataset.
# Parquet shards of the dataset repository, keyed by split name.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}
dataset_root = "hf://datasets/mteb/RuSciBenchOECDClassification/"
df = pd.read_parquet(f"{dataset_root}{splits['train']}")

# Hold out 20% of the rows of the train shard for evaluation and wrap both
# parts as `Dataset` objects.
# NOTE: no random_state is passed, so the split differs between runs.
train, test = (
    Dataset.from_pandas(part) for part in train_test_split(df, test_size=0.2)
)
# Text preprocessing: load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the 'text' entry of ``examples``.

    The text is truncated and padded with ``padding='max_length'``.

    Args:
        examples: Mapping with a 'text' entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for the given text.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')
# Tokenize both splits. batched=True hands tokenize_function whole batches of
# rows, so the tokenizer encodes many texts per call instead of one at a time;
# the resulting columns are the same as with per-row mapping.
tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)
# Load the pretrained checkpoint as a sequence-classification model.
# NOTE(review): the number of labels is hard-coded; confirm that 28 matches
# the number of classes actually present in the dataset.
num_labels = 28
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)
# Training configuration.
# The former `evaluate_during_training=True` flag is no longer a
# `TrainingArguments` parameter (passing it raises TypeError for an unexpected
# keyword). `eval_strategy='steps'` is its replacement: evaluation runs
# periodically during training.
# NOTE: older transformers releases spell this option `evaluation_strategy`.
training_args = TrainingArguments(
    output_dir='test_trainer_log',
    eval_strategy='steps',
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    report_to='none',
)
# Metric used during evaluation: F1.
metric = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        The result of ``metric.compute`` for the predictions and labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The metric defaults to average='binary', which is rejected for
    # multiclass targets; the model here is configured with more than two
    # labels, so an explicit multiclass averaging mode is required.
    return metric.compute(
        predictions=predictions, references=labels, average='macro'
    )
# Run fine-tuning: the held-out part of the data is used for evaluation and
# scored with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)
trainer.train()
# Save the fine-tuned model.
save_directory = './pt_save_pretrained'
# NOTE: the tokenizer is not saved; the call below is intentionally disabled.
# tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
# Alternative: save through the trainer instead.
# trainer.save_model('CustomModels/CustomHamSpam')