Files changed (2)
  1. app.py +5 -74
  2. requirements.txt +1 -6
app.py CHANGED
@@ -1,76 +1,7 @@
- import datasets
- import evaluate
- import os
- import pandas as pd
- import numpy as np
- from datasets import Dataset
- from sklearn.model_selection import train_test_split
- from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
-                           TrainingArguments, Trainer)
-
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
- model_name = "cointegrated/rubert-tiny2"
-
- # Login using e.g. `huggingface-cli login` to access this dataset
- splits = {'train': 'train.json', 'test': 'test.json'}
- df = pd.read_json("hf://datasets/Den4ikAI/gibberish_dataset/" + splits["train"])
- df = df.head(500)
-
- # Convert the dataframe into Dataset objects
- train, test = train_test_split(df, test_size=0.2)
- train = Dataset.from_pandas(train)
- test = Dataset.from_pandas(test)
-
- # Preprocess the text
- tokenizer = AutoTokenizer.from_pretrained(model_name, max_len=400)
-
- def tokenize_function(examples):
-     return tokenizer(examples['text'], padding='max_length', truncation=True)
-
- tokenized_train = train.map(tokenize_function)
- tokenized_test = test.map(tokenize_function)
-
- # Load the pretrained model
- model = AutoModelForSequenceClassification.from_pretrained(
-     model_name,
-     num_labels=4)
-
- model.to("cpu")
-
- # Set the training parameters
- training_args = TrainingArguments(
-     output_dir='test_trainer_log',
-     eval_strategy='epoch',
-     per_device_train_batch_size=6,
-     per_device_eval_batch_size=6,
-     num_train_epochs=5,
-     report_to='none'
- )
-
- metric = evaluate.load('f1')
- def compute_metrics(eval_pred):
-     logits, labels = eval_pred
-     predictions = np.argmax(logits, axis=-1)
-     return metric.compute(
-         predictions=predictions,
-         references=labels,
-         average='micro'
-     )
-
- # Run the training
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized_train,
-     eval_dataset=tokenized_test,
-     compute_metrics=compute_metrics)
-
- trainer.train()
-
- # Save the model
- save_directory = './pt_save_pretrained'
- # tokenizer.save_pretrained(save_directory)
- model.save_pretrained(save_directory)
- # alternatively, save the trainer
- # trainer.save_model('CustomModels/CustomHamSpam')
+ import gradio as gr
+
+ def greet(name):
+     return "Hello " + name + "!!"
+
+ demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+ demo.launch()
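Note: the removed script saved its fine-tuned classifier to ./pt_save_pretrained. If a later revision wanted the Space to serve that model rather than the placeholder greeter, a minimal sketch could look like the following. The pipeline task, checkpoint path, and output format here are illustrative assumptions, not part of this PR:

import gradio as gr
from transformers import pipeline

# Assumption: the checkpoint written by the removed training script.
# The tokenizer was not saved alongside it (that call was commented out),
# so it is loaded from the base model instead.
clf = pipeline("text-classification",
               model="./pt_save_pretrained",
               tokenizer="cointegrated/rubert-tiny2")

def classify(text):
    # Return the top predicted label and its score.
    top = clf(text)[0]
    return f"{top['label']} ({top['score']:.3f})"

demo = gr.Interface(fn=classify, inputs="text", outputs="text")
demo.launch()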
requirements.txt CHANGED
@@ -1,9 +1,4 @@
  transformers
  torch
  accelerate
- bitsandbytes
- datasets
- evaluate
- pandas
- numpy
- scikit-learn
+ bitsandbytes