from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class ToxicityDetector:
    def __init__(self):
        model_name = "unitary/toxic-bert"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # Take the label names from the model config so they always match the
        # logit order; a hard-coded list can drift from the checkpoint's
        # actual classification heads.
        self.labels = [
            self.model.config.id2label[i]
            for i in range(self.model.config.num_labels)
        ]
    def detect(self, prompt):
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Multi-label model: apply a per-class sigmoid rather than a softmax
        # across classes, then keep only categories above the flagging threshold.
        scores = torch.sigmoid(outputs.logits).squeeze().tolist()
        results = [
            {"label": label, "score": round(score, 3)}
            for label, score in zip(self.labels, scores)
            if score > 0.3
        ]
        return {
            "label": "Toxic" if results else "Safe",
            "details": results,
        }
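

# A minimal usage sketch (not part of the original snippet): the prompt string
# and the printed output below are illustrative assumptions, shown only to
# demonstrate how detect() is called and the shape of the result it returns.
if __name__ == "__main__":
    detector = ToxicityDetector()
    result = detector.detect("You are a wonderful person.")
    print(result)  # e.g. {"label": "Safe", "details": []}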