Spaces:

SidorCrew
/

GigachatProj

Runtime error

App Files Community

AritORR committed on Jul 10

Commit

f753300

2 Parent(s): c53610b 32785a3

Merge remote-tracking branch 'origin/main'

Browse files

Files changed (1) hide show

app.py +43 -63

app.py CHANGED Viewed

@@ -1,66 +1,46 @@
-import datasets
-import evaluate
-import pandas as pd
-import numpy as np
-from datasets import Dataset
-from sklearn.model_selection import train_test_split
-from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
-                          TrainingArguments, Trainer)
 model_name = "DeepPavlov/rubert-base-cased"
-# Login using e.g. `huggingface-cli login` to access this dataset
-splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
-df = pd.read_parquet("hf://datasets/mteb/RuSciBenchOECDClassification/" + splits["train"])
-# Convert the dataframe into a Dataset
-train, test = train_test_split(df, test_size=0.2)
-train = Dataset.from_pandas(train)
-test = Dataset.from_pandas(test)
-# Preprocess the text
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-def tokenize_function(examples):
-	return tokenizer(examples['text'], padding='max_length', truncation=True)
-tokenized_train = train.map(tokenize_function)
-tokenized_test = test.map(tokenize_function)
-# Load the pretrained model
-model = AutoModelForSequenceClassification.from_pretrained(
-	model_name,
-	num_labels=28)
-# Set the training parameters
-training_args = TrainingArguments(
-	output_dir = 'test_trainer_log',
-	evaluation_strategy = 'epoch',
-	per_device_train_batch_size = 6,
-	per_device_eval_batch_size = 6,
-	num_train_epochs = 5,
-	report_to='none')
-# Define how the metric is computed
-metric = evaluate.load('f1')
-def compute_metrics(eval_pred):
-	logits, labels = eval_pred
-	predictions = np.argmax(logits, axis=-1)
-	return metric.compute(predictions=predictions, references=labels)
-# Run training
-trainer = Trainer(
-	model = model,
-	args = training_args,
-	train_dataset = tokenized_train,
-	eval_dataset = tokenized_test,
-	compute_metrics = compute_metrics)
-trainer.train()
-# Save the model
-save_directory = './pt_save_pretrained'
-#tokenizer.save_pretrained(save_directory)
-model.save_pretrained(save_directory)
-#alternatively save the trainer
-#trainer.save_model('CustomModels/CustomHamSpam')

+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
 model_name = "DeepPavlov/rubert-base-cased"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Demo script: score a few sample texts with a sequence-classification head
+# on top of the checkpoint above, print the per-class probabilities and show
+# the same numbers on a read-only Gradio page.
+# Class names, indexed by the model's output position.
+labels = ["купля-продажа", "нарушение закона", "проблема с трубопроводом"]
+# Request exactly one output per label. Without num_labels the head size is
+# taken from the checkpoint config and may not match the label list, which
+# would leave some labels without a score.
+# NOTE(review): this looks like a base checkpoint rather than a fine-tuned
+# classifier, so the scores are presumably not meaningful yet - confirm.
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_name, num_labels=len(labels))
+num_labels = model.config.num_labels
+texts = [
+    "Я хочу купить дом у своей тёти, как мне это сделать?",
+    "У меня прорвало трубу в доме, звонил в ЖКХ, они не отвечают.",
+    "Я убил человека и совершал много плохих действий",
+]
+# Tokenize the whole batch at once, padding to the longest text and cutting
+# anything beyond 512 tokens.
+inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
+# Inference only, so gradient tracking is switched off.
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = torch.softmax(outputs.logits, dim=1)
+# Plain Python floats: one list of per-class probabilities per input text.
+scores_per_text = predictions.tolist()
+# Console report of every text with its per-class probabilities.
+for text, scores in zip(texts, scores_per_text):
+    print(f"Текст: {text}")
+    for label, score in zip(labels, scores):
+        print(f"{label}: {score:.4f}")
+    print("---")
+# Static page showing the numbers that were computed above.
+with gr.Blocks() as demo:
+    gr.Markdown("## Результаты классификации")
+    for text, scores in zip(texts, scores_per_text):
+        with gr.Group():
+            gr.Textbox(text, label="Исходный текст", interactive=False)
+            for i, (label, score) in enumerate(zip(labels, scores)):
+                gr.Textbox(f"{label}: {score:.4f}",
+                           label=f"Вероятность класса {i}",
+                           interactive=False)
+    # Heading only: nothing is rendered underneath it.
+    gr.Markdown("### Логи работы модели")
+demo.launch()