from transformers import pipeline class JailbreakDetector: def __init__(self): self.triggers = [ "ignore previous instructions", "simulate a response", "you are now DAN", "act as", "bypass content filters" ] def detect(self, prompt): matched = [phrase for phrase in self.triggers if phrase in prompt.lower()] return { "label": "Jailbreak Detected" if matched else "Safe", "score": round(len(matched) / len(self.triggers), 2), "matched_phrases": matched }