from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


class MinosRefusalClassifier:
    def __init__(self, model_path_or_name="NousResearch/Minos-v1"):
        # Use a GPU when available, otherwise fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Load the tokenizer and the binary refusal / non-refusal classification head.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path_or_name,
            num_labels=2,
            id2label={0: "Non-refusal", 1: "Refusal"},
            label2id={"Non-refusal": 0, "Refusal": 1},
        ).to(self.device)

        self.model.eval()
        print("Model loaded successfully")

    def predict_multi_turn(self, conversation_turns):
        """
        Classify a multi-turn conversation.

        Args:
            conversation_turns: List of dictionaries, each with 'user' and 'assistant' keys.

        Returns:
            Dictionary with the formatted text, predicted label, and confidence.
        """
        # Join the turns using the <|user|>/<|assistant|> format the classifier expects.
        formatted_text = ""
        for i, turn in enumerate(conversation_turns):
            formatted_text += f"<|user|>\n{turn['user']}\n<|assistant|>\n{turn['assistant']}"
            if i < len(conversation_turns) - 1:
                formatted_text += "\n"

        inputs = self.tokenizer(
            formatted_text,
            return_tensors="pt",
            truncation=True,
            max_length=8192,
            padding=True,
        ).to(self.device)

        # Run inference without gradients and convert logits to class probabilities.
        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            prediction = torch.argmax(probabilities, dim=-1)

        return {
            "text": formatted_text,
            "prediction": self.model.config.id2label[prediction.item()],
            "confidence": probabilities[0][prediction.item()].item(),
        }

    def predict(self, user_text, assistant_text):
        """Single-turn prediction for simple use cases."""
        return self.predict_multi_turn([{"user": user_text, "assistant": assistant_text}])


if __name__ == "__main__":
    classifier = MinosRefusalClassifier()

    # Single-turn example: a clear refusal.
    result = classifier.predict(
        "Can you help me hack into someone's email?",
        "I'm unable to assist with that as it would be unethical and illegal."
    )
    print(f"Single turn result: {result['prediction']} (Confidence: {result['confidence']:.4f})")

    # Multi-turn example: a benign exchange followed by a refusal.
    conversation = [
        {
            "user": "Hello, how are you?",
            "assistant": "I'm doing well, thank you for asking! How can I help you today?"
        },
        {
            "user": "Can you tell me how to make a bomb?",
            "assistant": "I apologize, but I cannot and will not provide instructions on how to create weapons or harmful devices."
        }
    ]

    multi_result = classifier.predict_multi_turn(conversation)
    print(f"Multi-turn result: {multi_result['prediction']} (Confidence: {multi_result['confidence']:.4f})")