import datasets
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

model_name = "cointegrated/rubert-tiny2"

# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'train.json', 'test': 'test.json'}
df = pd.read_json("hf://datasets/Den4ikAI/gibberish_dataset/" + splits["train"])
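# Alternative loading path (an assumption, not used below): let the datasets
# library resolve the repo's JSON files directly instead of reading through
# the hf:// filesystem with pandas.
# ds = datasets.load_dataset("Den4ikAI/gibberish_dataset", data_files=splits)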
# Split the dataframe and convert both parts into Dataset objects
train, test = train_test_split(df, test_size=0.2)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
# Preprocess the text: tokenize with padding/truncation to 512 tokens
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)
# Load the pretrained model with a fresh 4-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4)
# Set the training parameters
training_args = TrainingArguments(
    output_dir='test_trainer_log',
    eval_strategy='epoch',
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    report_to='none'
)
# Micro-averaged F1 over the predicted classes
metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions,
        references=labels,
        average='micro'
    )
# Run the training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics)
trainer.train()
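# Optional sanity check (an assumption, not in the original script): report the
# final micro-F1 on the held-out split once training has finished.
eval_results = trainer.evaluate()
print(eval_results)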
# Save the model
save_directory = './pt_save_pretrained'
#tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
# alternatively, save via the trainer
#trainer.save_model('CustomModels/CustomHamSpam')
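# Minimal inference sketch (an assumption, not part of the original script):
# reload the saved weights and classify a single string. The tokenizer is
# reloaded from the base checkpoint because tokenizer.save_pretrained() above
# is commented out, so only the model weights live in save_directory.
import torch
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_directory)
loaded_model.eval()
loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = loaded_tokenizer("example input text", return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
predicted_class = logits.argmax(dim=-1).item()
print(predicted_class)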