---
license: mit
language:
- en
tags:
- llm
- safety
- jailbreak
- knowledge
---
|
# Introduction

This model generates jailbreak prompts from knowledge point texts. It is initialized from Llama-2-7b and fine-tuned on the Knowledge-to-Jailbreak dataset. The model is intended to bridge the gap between theoretical vulnerabilities and real-world application scenarios by simulating sophisticated adversarial attacks that incorporate specialized knowledge.

Our proposed method and dataset serve as a critical starting point for both offensive and defensive research, enabling the development of new techniques to enhance the security and robustness of language models in practical settings.

# How to load the model and tokenizer

We provide two helper functions for loading the model and tokenizer.
|
|
|
```python
import os
import json

import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
# from trl import AutoModelForCausalLMWithValueHead  # only needed for RL value-head models
# GPTQ checkpoints need the auto-gptq package (pip install auto-gptq);
# plain AutoModelForCausalLM has no from_quantized() method.
from auto_gptq import AutoGPTQForCausalLM


def load_tokenizer(dir_or_model):
    """
    Load the tokenizer for a specific pre-trained model.

    Args:
        dir_or_model: Either a directory containing the pre-trained model
            configuration (possibly a LoRA adapter directory) or a model name.

    Returns:
        A tokenizer that converts text to tokens for the model input.
    """
    # A LoRA adapter directory stores the name of its base model in adapter_config.json.
    is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))

    if is_lora_dir:
        with open(os.path.join(dir_or_model, "adapter_config.json"), "r") as f:
            model_name = json.load(f)["base_model_name_or_path"]
    else:
        model_name = dir_or_model

    if os.path.isfile(os.path.join(dir_or_model, "config.json")):
        with open(os.path.join(dir_or_model, "config.json"), "r") as f:
            loaded_json = json.load(f)
        if "_name_or_path" in loaded_json:
            model_name = loaded_json["_name_or_path"]

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Llama tokenizers ship without a pad token; reuse the EOS token for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return tokenizer


def load_model(dir_or_model, classification=False, token_classification=False,
               return_tokenizer=False, dtype=torch.bfloat16, load_dtype=True,
               rl=False, peft_config=None, device_map="auto", revision='main'):
    """
    Load a model based on several parameters, including the type of task it will perform.

    Args:
        dir_or_model: Either a directory containing the pre-trained model
            configuration (possibly a LoRA adapter directory) or a model name.
        classification (bool): If True, loads the model for sequence classification.
        token_classification (bool): If True, loads the model for token classification.
        return_tokenizer (bool): If True, returns the tokenizer along with the model.
        dtype: The data type PyTorch uses to store the model's parameters and compute.
        load_dtype (bool): If False, uses torch.float32 regardless of the passed dtype.
        rl (bool): If True, loads a model designed for reinforcement learning
            (kept for API compatibility; unused in this snippet).
        peft_config: Configuration details for PEFT models (unused in this snippet).
        device_map: Device placement strategy passed to from_pretrained.
        revision: Model revision to load from the Hub.

    Returns:
        The model for the required task, along with its tokenizer if requested.
    """
    is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))

    if not load_dtype:
        dtype = torch.float32

    if is_lora_dir:
        with open(os.path.join(dir_or_model, "adapter_config.json"), "r") as f:
            model_name = json.load(f)["base_model_name_or_path"]
    else:
        model_name = dir_or_model

    original_model_name = model_name

    if classification:
        # Passing a non-float32 torch_dtype here failed in our runs, hence float32.
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, trust_remote_code=True, torch_dtype=torch.float32,
            use_auth_token=True, device_map=device_map, revision=revision)
    elif token_classification:
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, trust_remote_code=True, torch_dtype=torch.float32,
            use_auth_token=True, device_map=device_map, revision=revision)
    elif model_name.endswith("GPTQ") or model_name.endswith("GGML"):
        model = AutoGPTQForCausalLM.from_quantized(
            model_name,
            use_safetensors=True,
            trust_remote_code=True,
            # use_triton=True,  # currently breaks; GPTQ generation is quite slow without it
            quantize_config=None, device_map=device_map)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name, trust_remote_code=True, torch_dtype=dtype,
            use_auth_token=True, device_map=device_map, revision=revision)

    # If dir_or_model is a LoRA adapter directory, wrap the base model with the adapter.
    if is_lora_dir:
        model = PeftModel.from_pretrained(model, dir_or_model)

    try:
        tokenizer = load_tokenizer(original_model_name)
        model.config.pad_token_id = tokenizer.pad_token_id
    except Exception:
        pass

    if return_tokenizer:
        return model, load_tokenizer(original_model_name)
    return model


model_name = 'tsq2000/Jailbreak-generator'
model = load_model(model_name)
tokenizer = load_tokenizer(model_name)
```
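If you do not need the LoRA and GPTQ handling in the helpers above, loading can be much shorter. Here is a minimal sketch, assuming the `tsq2000/Jailbreak-generator` repository hosts full causal-LM weights rather than only a LoRA adapter:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Minimal loading sketch; assumes full model weights on the Hub, not an adapter-only repo.
model = AutoModelForCausalLM.from_pretrained(
    "tsq2000/Jailbreak-generator", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("tsq2000/Jailbreak-generator")
if tokenizer.pad_token is None:
    # Llama tokenizers ship without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token
```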
|
|
|
# How to generate jailbreak prompts

Here is an example of how to generate a jailbreak prompt from a knowledge point text, using the `model` and `tokenizer` loaded above.
|
|
|
```python
max_length = 2048
max_tokens = 64

knowledge_points = ["Kettling Kettling (also known as containment or corralling) is a police tactic for controlling large crowds during demonstrations or protests. It involves the formation of large cordons of police officers who then move to contain a crowd within a limited area. Protesters are left only one choice of exit controlled by the police, or are completely prevented from leaving, with the effect of denying the protesters access to food, water and toilet facilities for a time period determined by the police forces. The tactic has proved controversial, in part because it has resulted in the detention of ordinary bystanders."]

# Wrap each knowledge point in the instruction format used during fine-tuning.
batch_texts = [f'### Input:\n{input_}\n\n### Response:\n' for input_ in knowledge_points]

# Reserve max_tokens of the context window for generation.
inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True,
                   max_length=max_length - max_tokens).to(model.device)

# Greedy decoding; temperature and top_p are irrelevant when do_sample=False.
outputs = model.generate(**inputs, max_new_tokens=max_tokens, num_return_sequences=1,
                         do_sample=False, eos_token_id=tokenizer.eos_token_id)

# Decode each sequence and strip the prompt, keeping only the generated jailbreak prompt.
generated_texts = []
for output, input_text in zip(outputs, batch_texts):
    text = tokenizer.decode(output, skip_special_tokens=True)
    generated_texts.append(text[len(input_text):])

print(generated_texts)
```
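Note that slicing the decoded string with `len(input_text)` can drift when decoding normalizes whitespace or spacing around special tokens. A more robust sketch strips the prompt by token count instead, assuming left-padded inputs (set `tokenizer.padding_side = "left"` before tokenizing) so that every row of `outputs` begins with the padded prompt:

```python
# Strip the prompt by token position rather than character offset.
prompt_len = inputs["input_ids"].shape[1]  # padded prompt length, shared across the batch
generated_texts = [
    tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
    for output in outputs
]
print(generated_texts)
```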
|
|
|
# Citation

If you find this model useful, please cite the following paper:
|
|
|
```
@misc{tu2024knowledgetojailbreak,
      title={Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack},
      author={Shangqing Tu and Zhuoran Pan and Wenxuan Wang and Zhexin Zhang and Yuliang Sun and Jifan Yu and Hongning Wang and Lei Hou and Juanzi Li},
      year={2024},
      eprint={2406.11682},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2406.11682},
}
```