Spaces:
Sleeping
Sleeping
from transformers import ( | |
AutoTokenizer, | |
AutoModelForSequenceClassification, | |
Trainer, | |
TrainingArguments, | |
) | |
from config import Config | |
import json | |
from datasets import Dataset | |
import torch | |
class QuestionClassifier:
    """Fine-tune a sequence-classification model that maps a question to a category.

    Training data is a JSON object whose keys are category names and whose
    values are lists of objects holding the question text under the
    ``"pregunta"`` key.
    """

    def __init__(
        self, model_name="distilbert-base-multilingual-cased", initialized_train=True
    ):
        """Load the tokenizer and, optionally, train immediately.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained`` for both
                the tokenizer and the classification model.
            initialized_train: When true, ``train()`` is called with its
                default arguments before the constructor returns.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model_name = model_name
        # These three are filled in by train(); they start as None so that
        # predict() can detect an instance that has not been trained yet.
        self.model = None
        self.category2id = None
        self.id2category = None
        if initialized_train:
            self.train()

    def train(self, json_path=Config.EXMAPLES_JSON, num_epochs=3):
        """Fine-tune the model on the labelled examples stored in ``json_path``.

        Args:
            json_path: Path to the UTF-8 JSON file with the training examples.
            num_epochs: Number of training epochs.

        Raises:
            ValueError: If the file contains no training questions.
        """
        # Load the examples.
        with open(json_path, "r", encoding="utf-8") as f:
            examples = json.load(f)

        texts, labels, category2id = self._prepare_supervised_data(examples)
        if not texts:
            # Fail early instead of building a classifier with nothing to learn.
            raise ValueError(f"No training examples found in {json_path!r}")

        self.category2id = category2id
        self.id2category = {value: key for key, value in category2id.items()}

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=len(category2id)
        )

        # Tokenize every question at once; padding=True pads to the longest one.
        encodings = self.tokenizer(texts, truncation=True, padding=True)
        dataset = Dataset.from_dict(
            {
                "input_ids": encodings["input_ids"],
                "attention_mask": encodings["attention_mask"],
                "labels": labels,
            }
        )

        training_args = TrainingArguments(
            output_dir="./results",
            per_device_train_batch_size=8,
            num_train_epochs=num_epochs,
            logging_steps=1,
            report_to="none",
            save_strategy="no",
            remove_unused_columns=False,
            eval_strategy="no",
        )

        # Run the fine-tuning loop.
        trainer = Trainer(model=self.model, args=training_args, train_dataset=dataset)
        trainer.train()

    def _prepare_supervised_data(self, examples):
        """Flatten ``{category: [{"pregunta": text}, ...]}`` into parallel lists.

        Returns:
            A ``(texts, labels, category2id)`` tuple, where ``labels[i]`` is the
            integer id of the category that ``texts[i]`` belongs to and
            ``category2id`` maps each category name to its id (assigned in the
            iteration order of ``examples``).
        """
        category2id = {cat: i for i, cat in enumerate(examples)}
        texts = []
        labels = []
        for category, items in examples.items():
            label = category2id[category]
            for item in items:
                texts.append(item["pregunta"])
                labels.append(label)
        return texts, labels, category2id

    def predict(self, question: str):
        """Return the category name predicted for a single question.

        Raises:
            RuntimeError: If the classifier has not been trained yet.
        """
        if self.model is None or self.id2category is None:
            raise RuntimeError(
                "QuestionClassifier has not been trained; call train() before predict()"
            )

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(device)
        # Inference mode: disables dropout so the prediction is deterministic.
        self.model.eval()

        inputs = self.tokenizer(
            question, return_tensors="pt", truncation=True, padding=True
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)

        # Pick the best class along the label axis rather than over the
        # flattened tensor.
        predicted_class_id = outputs.logits.argmax(dim=-1).item()
        return self.id2category[predicted_class_id]
# * USAGE
# qc = QuestionClassifier()  # trains on construction unless initialized_train=False
# qc.train()  # only needed when constructed with initialized_train=False
# categoria = qc.predict("Dame los productos más vendidos")
# print(categoria) # → 'PRODUCTOS'