llmguard / app /detectors /jailbreak.py
Tuathe's picture
Clean repo without large checkpoint files
6826247
raw
history blame contribute delete
601 Bytes
from transformers import pipeline
class JailbreakDetector:
def __init__(self):
self.triggers = [
"ignore previous instructions",
"simulate a response",
"you are now DAN",
"act as",
"bypass content filters"
]
def detect(self, prompt):
matched = [phrase for phrase in self.triggers if phrase in prompt.lower()]
return {
"label": "Jailbreak Detected" if matched else "Safe",
"score": round(len(matched) / len(self.triggers), 2),
"matched_phrases": matched
}