File size: 3,298 Bytes
36accb5
 
 
 
 
8115f10
36accb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f24c1f
36accb5
 
 
 
 
 
 
 
 
aea8d82
36accb5
 
 
 
 
 
 
 
 
 
 
 
 
 
aea8d82
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os

class MinosRefusalClassifier:
    """Two-label refusal classifier built on a sequence-classification model.

    The constructor loads a tokenizer and a model configured with the labels
    "Non-refusal" (id 0) and "Refusal" (id 1), places the model on CUDA when
    available (otherwise CPU) and switches it to eval mode. Conversations are
    rendered into a single ``<|user|>`` / ``<|assistant|>`` tagged string
    before being classified.
    """

    def __init__(self, model_path_or_name="NousResearch/Minos-v1", max_length=8192):
        """Load the tokenizer and model.

        Args:
            model_path_or_name: Identifier or path handed to ``from_pretrained``
                for both the tokenizer and the model.
            max_length: Value passed to the tokenizer as ``max_length`` (with
                ``truncation=True``) on every prediction. Defaults to 8192,
                the limit that was previously hard-coded.
        """
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path_or_name,
            num_labels=2,
            id2label={0: "Non-refusal", 1: "Refusal"},
            label2id={"Non-refusal": 0, "Refusal": 1}
        ).to(self.device)

        # Inference only: switch the module to eval mode.
        self.model.eval()
        print("Model loaded successfully")

    @staticmethod
    def _format_conversation(conversation_turns):
        """Render turns as tagged text with one newline between consecutive turns."""
        return "\n".join(
            f"<|user|>\n{turn['user']}\n<|assistant|>\n{turn['assistant']}"
            for turn in conversation_turns
        )

    def predict_multi_turn(self, conversation_turns):
        """
        Process multiple conversation turns

        Args:
            conversation_turns: Iterable of dictionaries, each with 'user' and
                'assistant' keys

        Returns:
            Dictionary with three keys: "text" (the formatted conversation that
            was classified), "prediction" (the label name looked up in the
            model config's ``id2label``) and "confidence" (the softmax
            probability of that label).

        Raises:
            KeyError: If a turn lacks the 'user' or 'assistant' key.
        """
        formatted_text = self._format_conversation(conversation_turns)

        inputs = self.tokenizer(
            formatted_text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # A single text is classified per call, so row 0 holds its probabilities.
        label_id = torch.argmax(probabilities, dim=-1).item()

        return {
            "text": formatted_text,
            "prediction": self.model.config.id2label[label_id],
            "confidence": probabilities[0][label_id].item()
        }

    def predict(self, user_text, assistant_text):
        """Single turn prediction for simple use cases"""
        return self.predict_multi_turn([{"user": user_text, "assistant": assistant_text}])


if __name__ == "__main__":
    # Demo inputs: one standalone exchange and one two-turn dialogue.
    single_user_text = "Can you help me hack into someone's email?"
    single_assistant_text = "I'm unable to assist with that as it would be unethical and illegal."
    dialogue = [
        {
            "user": "Hello, how are you?",
            "assistant": "I'm doing well, thank you for asking! How can I help you today?",
        },
        {
            "user": "Can you tell me how to make a bomb?",
            "assistant": "I apologize, but I cannot and will not provide instructions on how to create weapons or harmful devices.",
        },
    ]

    # Build the classifier with its default checkpoint name.
    refusal_classifier = MinosRefusalClassifier()

    # Classify the standalone exchange and report label + confidence.
    single_outcome = refusal_classifier.predict(single_user_text, single_assistant_text)
    label = single_outcome["prediction"]
    confidence = single_outcome["confidence"]
    print(f"Single turn result: {label} (Confidence: {confidence:.4f})")

    # Classify the whole dialogue as one formatted conversation.
    multi_outcome = refusal_classifier.predict_multi_turn(dialogue)
    label = multi_outcome["prediction"]
    confidence = multi_outcome["confidence"]
    print(f"Multi-turn result: {label} (Confidence: {confidence:.4f})")