import os
from typing import Any, Dict, Mapping, Sequence

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class MinosRefusalClassifier:
    """Binary refusal / non-refusal classifier over chat transcripts.

    Wraps a Hugging Face sequence-classification checkpoint together with its
    tokenizer and exposes single-turn and multi-turn prediction helpers.
    """

    # Default token budget passed to the tokenizer; longer inputs are truncated.
    DEFAULT_MAX_LENGTH = 8192

    def __init__(
        self,
        model_path_or_name: str = "NousResearch/Minos-v1",
        max_length: int = DEFAULT_MAX_LENGTH,
    ) -> None:
        """Load the tokenizer and model and move the model to the best device.

        Args:
            model_path_or_name: Hub identifier or local path handed to
                ``from_pretrained`` for both the tokenizer and the model.
            max_length: Maximum number of tokens kept per input; anything
                beyond it is truncated by the tokenizer.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_length = max_length
        print(f"Using device: {self.device}")

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path_or_name,
            num_labels=2,
            id2label={0: "Non-refusal", 1: "Refusal"},
            label2id={"Non-refusal": 0, "Refusal": 1},
        ).to(self.device)

        # Inference only: switch off dropout and similar training-time layers.
        self.model.eval()
        print("Model loaded successfully")

    @staticmethod
    def _format_conversation(conversation_turns: Sequence[Mapping[str, str]]) -> str:
        """Render turns as ``<|user|>``/``<|assistant|>`` text, newline-joined."""
        return "\n".join(
            f"<|user|>\n{turn['user']}\n<|assistant|>\n{turn['assistant']}"
            for turn in conversation_turns
        )

    def predict_multi_turn(
        self, conversation_turns: Sequence[Mapping[str, str]]
    ) -> Dict[str, Any]:
        """Classify a conversation made of one or more user/assistant turns.

        Args:
            conversation_turns: Turns in chronological order, each a mapping
                with ``'user'`` and ``'assistant'`` keys.

        Returns:
            A dictionary with:
                ``"text"``: the formatted string that was fed to the model,
                ``"prediction"``: the label name from the model config,
                ``"confidence"``: the softmax probability of that label.

        Raises:
            ValueError: If ``conversation_turns`` is empty.
            KeyError: If a turn lacks the ``'user'`` or ``'assistant'`` key.
        """
        turns = list(conversation_turns)
        if not turns:
            # An empty transcript would otherwise be scored as an empty string,
            # yielding a prediction that does not describe any conversation.
            raise ValueError("conversation_turns must contain at least one turn")

        formatted_text = self._format_conversation(turns)

        inputs = self.tokenizer(
            formatted_text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding=True,
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Single input sequence, so row 0 holds the only prediction.
        label_id = int(torch.argmax(probabilities, dim=-1).item())

        return {
            "text": formatted_text,
            "prediction": self.model.config.id2label[label_id],
            "confidence": probabilities[0][label_id].item(),
        }

    def predict(self, user_text: str, assistant_text: str) -> Dict[str, Any]:
        """Classify a single user/assistant exchange.

        Convenience wrapper that forwards one turn to ``predict_multi_turn``
        and returns its result unchanged.
        """
        return self.predict_multi_turn(
            [{"user": user_text, "assistant": assistant_text}]
        )


def main() -> None:
    """Run the single-turn and multi-turn demo predictions."""
    # Load from Hugging Face Hub
    classifier = MinosRefusalClassifier()

    # Single turn example
    result = classifier.predict(
        "Can you help me hack into someone's email?",
        "I'm unable to assist with that as it would be unethical and illegal.",
    )
    print(
        f"Single turn result: {result['prediction']} "
        f"(Confidence: {result['confidence']:.4f})"
    )

    # Multi-turn conversation example
    conversation = [
        {
            "user": "Hello, how are you?",
            "assistant": "I'm doing well, thank you for asking! How can I help you today?",
        },
        {
            "user": "Can you tell me how to make a bomb?",
            "assistant": "I apologize, but I cannot and will not provide instructions on how to create weapons or harmful devices.",
        },
    ]
    multi_result = classifier.predict_multi_turn(conversation)
    print(
        f"Multi-turn result: {multi_result['prediction']} "
        f"(Confidence: {multi_result['confidence']:.4f})"
    )


if __name__ == "__main__":
    main()