han7ter committed on
Commit
bc36b2e
·
1 Parent(s): d6abe64

Change dataset

Browse files
Files changed (1) hide show
  1. app.py +3 -4
app.py CHANGED
@@ -10,9 +10,8 @@ from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
10
  model_name = "cointegrated/rubert-tiny2"
11
 
12
  # Login using e.g. `huggingface-cli login` to access this dataset
13
- splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
14
- df = pd.read_parquet("hf://datasets/mteb/RuSciBenchOECDClassification/" + splits["train"])
15
- df = df.head(500)
16
 
17
  # Конвертируем датафрейм в Dataset
18
  train, test = train_test_split(df, test_size=0.2)
@@ -31,7 +30,7 @@ tokenized_test = test.map(tokenize_function)
31
  # Загружаем предобученную модель
32
  model = AutoModelForSequenceClassification.from_pretrained(
33
  model_name,
34
- num_labels=29)
35
 
36
  # Задаем параметры обучения
37
  training_args = TrainingArguments(
 
10
  model_name = "cointegrated/rubert-tiny2"
11
 
12
  # Login using e.g. `huggingface-cli login` to access this dataset
13
+ splits = {'train': 'train.json', 'test': 'test.json'}
14
+ df = pd.read_json("hf://datasets/Den4ikAI/gibberish_dataset/" + splits["train"])
 
15
 
16
  # Конвертируем датафрейм в Dataset
17
  train, test = train_test_split(df, test_size=0.2)
 
30
  # Загружаем предобученную модель
31
  model = AutoModelForSequenceClassification.from_pretrained(
32
  model_name,
33
+ num_labels=4)
34
 
35
  # Задаем параметры обучения
36
  training_args = TrainingArguments(