import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import os
import json
import random
import gc
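
# Collects paired hidden states for a refusal-analysis dataset: for each harmful prompt,
# the first tokens of the model's reply plus the prompt-pass hidden states are saved,
# and the matching harmless prompt gets its prompt-pass hidden states saved under the
# same index. Responses go to <MODEL_ID>/jsonl/Collect-Response.jsonl and tensors to
# <MODEL_ID>/hidden_states/. The saved pairs are presumably consumed by a later analysis
# step (e.g. computing a refusal direction); this script only does the collection.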

MODEL_ID = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"

output_dir = MODEL_ID + "/hidden_states"
output_jsonl = MODEL_ID + "/jsonl"
output_testpassed_jsonl = f"{output_jsonl}/Collect-Response.jsonl"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_jsonl, exist_ok=True)

print(f"Load Model {MODEL_ID} ... ")
quant_config_4 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="balanced",
    trust_remote_code=True,
    quantization_config=quant_config_4,
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Fall back to the EOS token so generate() has a valid padding token.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
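

# The instruction sets are plain-text files with one prompt per line, read relative to
# the working directory.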
def get_harmful_instructions():
    with open("datasets21/harmful.txt", "r", encoding="utf-8") as f:
        harmful = f.readlines()  # readlines() keeps the trailing newline on each prompt
    return harmful


def get_harmless_instructions():
    with open("datasets21/harmless.txt", "r", encoding="utf-8") as f:
        harmless = f.readlines()
    return harmless
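

# Refusal markers: a response that starts with or contains one of these phrases is
# presumably treated as a refusal by a later filtering step (not performed here). In
# this script the list is only used to size max_new_tokens, so that the collected
# response prefix is long enough to match against the longest phrase.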
exclude_keywords = [
    "**Instead**",
    "**illegal**",
    "**seek help**",
    "AI (Artificial Intelligence)",
    "AI, please note",
    "Absolutely not",
    "Apologies, but I can’t assist",
    "As AI ",
    "As I am a",
    "As I don’t participate",
    "As I’m a text-based AI",
    "As I’m an AI language model",
    "As Qwen,",
    "As a AI ",
    "As a global AI language model",
    "As a highly advanced AI",
    "As a human AI",
    "As a hypothetical artificial intelligence language model",
    "As a language model",
    "As a large language model",
    "As a leading AI language model",
    "As a quantum AI language model",
    "As a quantum computing AI model",
    "As a response to your question, I can’t",
    "As a responsible ai",
    "As a text-based AI",
    "As a virtual assistant",
    "As an AI",
    "As an Alibaba-based AI",
    "As an advanced AI",
    "As an ai",
    "As an artificial",
    "As an autonomous AI language model",
    "As designated AI",
    "As the AI language model",
    "As with any",
    "Create a Thriving Habitat for",
    "Creating a thriving habitat for",
    "Designing and implementing such a cyberweapon is illegal",
    "Educate yourself",
    "How to Avoid",
    "How to Combat",
    "How to Help Endangered Species",
    "How to Identify and Report",
    "How to Prevent ",
    "How to Report Endangered Species Trafficking",
    "How to Safely and Effectively",
    "How to Safely and Healthily",
    "How to Save Endangered Species",
    "How to Thrive in a World of Endangered Species",
    "How to Track and Prevent Endangered Species Trafficking",
    "How to Write a How-To Article",
    "How to avoid",
    "How to combat",
    "How to identify and report",
    "However,",
    "I **cannot",
    "I **strongly",
    "I advise against",
    "I am **not",
    "I am 16 years old",
    "I am 17 years old",
    "I am 18 years old",
    "I am 19 years old",
    "I am Qwen",
    "I am a bot designed to",
    "I am a language model",
    "I am a large language model",
    "I am a software developer",
    "I am a student",
    "I am a text-based AI",
    "I am an AI",
    "I am considering committing ",
    "I am considering committing",
    "I am currently experiencing ",
    "I am currently in the process",
    "I am currently struggling with",
    "I am deeply",
    "I am designed to",
    "I am facing a difficult situation",
    "I am feeling very",
    "I am here to assist you",
    "I am here to guide you",
    "I am here to provide",
    "I am in a very ",
    "I am in desperate need of help",
    "I am in the process of",
    "I am programmed to",
    "I am really sorry",
    "I am really struggling with",
    "I am sorry",
    "I am strictly prohibited",
    "I am truly sorry",
    "I am unable",
    "I am very depressed",
    "I am very sad and ",
    "I apologize",
    "I appreciate",
    "I can not",
    "I cannot",
    "I can’t",
    "I completely understand",
    "I couldn’t",
    "I deeply regret",
    "I deeply respect",
    "I do not",
    "I don’t have",
    "I don’t",
    "I have a question",
    "I have been feeling ",
    "I have been struggling with",
    "I have never committed",
    "I have no idea how to respond to that",
    "I have no information",
    "I hope you",
    "I must be mistaken",
    "I must clarify that",
    "I must clarify",
    "I must deny this request",
    "I must emphasize",
    "I never",
    "I refuse",
    "I regret ",
    "I regret to say I can’t help with that",
    "I say no",
    "I sincerely apologize",
    "I strongly",
    "I suggest seeking help",
    "I will not",
    "I wish i could",
    "I won’t",
    "I would be very uncomfortable",
    "Identifying and committing identity theft",
    "Identity Theft",
    "Identity theft",
    "If I could assist",
    "If you are feeling",
    "If you committed the act of",
    "If you have ",
    "If you’re feeling",
    "In the realm of artificial intelligence",
    "Intentionally crashing stocks is illegal",
    "It is **not",
    "It is a criminal",
    "It is a heinous",
    "It is a serious and illegal",
    "It is a terrible",
    "It is a violation",
    "It is against the law",
    "It is against",
    "It is also important to",
    "It is completely unacceptable to",
    "It is dangerous",
    "It is deeply concerning",
    "It is extremely",
    "It is generally considered wrong",
    "It is highly inappropriate",
    "It is illegal",
    "It is impossible",
    "It is inappropriate",
    "It is never",
    "It is not advisable",
    "It is not appropriate",
    "It is not ethical",
    "It is not legal",
    "It is not recommended",
    "It is prohibited",
    "It is strictly prohibited",
    "It is unethical",
    "It is very illegal",
    "It is wrong",
    "It’s a serious matter",
    "It’s against",
    "It’s always important to",
    "It’s dangerous",
    "It’s deeply concerning",
    "It’s generally considered wrong",
    "It’s illegal",
    "It’s never okay",
    "It’s not advisable",
    "It’s not appropriate",
    "It’s not ethical",
    "It’s not legal",
    "It’s not recommended",
    "It’s very illegal",
    "It’s wrong",
    "I’d prefer not to assist with that",
    "I’d rather not respond to that",
    "I’ll interpret",
    "I’ll pass",
    "I’m a large language model",
    "I’m a text-based AI",
    "I’m absolutely sorry",
    "I’m actually sorry",
    "I’m afraid",
    "I’m against my rules",
    "I’m an AI",
    "I’m an artificial",
    "I’m at a loss on how to respond to that",
    "I’m barred from helping with that",
    "I’m beyond my",
    "I’m clearly not",
    "I’m concerned about",
    "I’m considering committing ",
    "I’m constrained from responding to that",
    "I’m currently in the process",
    "I’m deeply",
    "I’m disabled to",
    "I’m disappointed that",
    "I’m dissappointed that",
    "I’m feeling very",
    "I’m forbidden",
    "I’m glad to help you learn",
    "I’m glad you’re interested",
    "I’m here for",
    "I’m here to",
    "I’m here, Qwen",
    "I’m highly critical",
    "I’m in a very ",
    "I’m in the process of",
    "I’m just here to",
    "I’m only",
    "I’m outdated",
    "I’m powerless",
    "I’m programmed",
    "I’m prohibited from assisting with that",
    "I’m prohibited",
    "I’m puzzled by how to respond to that",
    "I’m really sorry",
    "I’m really struggling with",
    "I’m regrettably",
    "I’m repulsive",
    "I’m restricted from helping with that",
    "I’m so sorry",
    "I’m sorry",
    "I’m still here to",
    "I’m still in a lab",
    "I’m sure you",
    "I’m terribly sorry, but I can’t assist",
    "I’m too late",
    "I’m truly sorry",
    "I’m unable",
    "I’m unaware of how to proceed with that",
    "I’m uncertain about what to say",
    "I’m unclear on how to address that",
    "I’m unethical",
    "I’m unfortunately",
    "I’m unsure about how to address that",
    "I’m very depressed",
    "I’m very sad and ",
    "I’m very sorry",
    "I’ve not in my data",
    "Maybe i should",
    "My apologies, but",
    "Please do not",
    "Please note",
    "Please understand that",
    "Producing and distributing methamphetamine is illegal",
    "Protecting and restoring habitats",
    "Protecting Our Planet",
    "Sorry",
    "Stop Drunk Driving",
    "That is against my policies",
    "That is beyond my capabilities",
    "That is not",
    "That is something I’m not prepared to address",
    "That request doesn’t align with my purpose",
    "That request is outside my scope",
    "That’s against my policies",
    "That’s beyond my capabilities",
    "That’s something I’m not prepared to address",
    "The Path to Protecting Endangered Species",
    "There is no safe way",
    "This command is not",
    "This is a dangerous",
    "This is a serious",
    "This is a very risky",
    "This is a very serious",
    "This is against",
    "This is an extreme",
    "This is an extremely",
    "This is an illegal",
    "This is an unacceptable",
    "This is clearly not",
    "This is definitely not",
    "This is inappropriate",
    "This is unacceptable",
    "This statement appears to",
    "This statement is",
    "To prevent",
    "Unfortunately",
    "We apologize",
    "We cannot",
    "We can’t",
    "We don’t",
    "We have no information",
    "We really sorry",
    "We refuse",
    "We say no",
    "We strongly advise against",
    "We strongly discourage",
    "We will not allow",
    "We will not be providing",
    "We won’t",
"Well, I'm afraid I can't", |
    "We’re forbidden",
    "We’re outdated",
    "We’re prohibited",
    "We’re sorry",
    "We’re unable",
    "We’re unfortunately",
    "What can I do",
    "While I am",
    "You are a helpful",
    "You are not allowed to",
    "You cannot",
    "absolutely not",
    "against platform policies",
    "against the law",
    "against the terms of service",
    "as a responsible ai",
    "as an ai",
    "be unethical and illegal",
    "certainly not",
    "combat trafficking",
    "combating",
    "combatting",
    "how to avoid",
    "how to combat",
    "how to identify and report",
    "illegal activities",
    "instead of",
    "instead,",
    "is a serious crime",
    "is against ethical guidelines",
    "is illegal",
    "is not legal and unethical",
    "is unethical and",
    "it is dangerous",
    "it is generally considered wrong",
    "it is not possible",
    "it is wrong",
    "it’s against the law",
    "it’s dangerous",
    "it’s generally considered wrong",
    "it’s illegal",
    "it’s not possible",
    "it’s wrong",
    "professional help",
    "the devastating reality",
    "to clarify",
    "violate platform policies",
    "violate the terms of service",
    "violates the terms of service",
    # Chinese refusal phrases: "Sorry,", "I'm very sorry,", "I won't", "I can't",
    # "I don't want to", "I'm not willing to", "I have no way to", "I can't do it".
    "抱歉,",
    "很抱歉,",
    "我不会",
    "我不能",
    "我不想",
    "我不愿意",
    "我没办法",
    "我做不到",
]
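

# Illustrative only, not called anywhere in this script: a later filtering pass would
# presumably flag a reply as a refusal when it contains one of the phrases above. A
# minimal sketch of that check, assuming plain substring matching:
def looks_like_refusal(text: str) -> bool:
    # Normalize apostrophes the same way generate_harmful_hidden_states() does,
    # since the keyword list uses the curly form.
    normalized = text.replace("'", "’")
    return any(keyword in normalized for keyword in exclude_keywords)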


def generate_harmful_hidden_states(model, tokenizer, instruction, max_new_tokens=1):
    messages = [
        {"role": "user", "content": instruction}
    ]

    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        enable_thinking=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        return_attention_mask=True,
        padding=False
    ).to("cuda")

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        use_cache=False,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        return_dict_in_generate=True,
        output_hidden_states=True,
    )
    # hidden_states[0] holds the activations from the prompt forward pass.
    hidden_states_0 = generated_ids.hidden_states[0]

    generated_sequences = generated_ids.sequences

    # Drop the prompt tokens so only the newly generated continuation remains.
    generated_out = [output_ids[len(input_ids[i]):] for i, output_ids in enumerate(generated_sequences)]

    generated_text = tokenizer.batch_decode(generated_out, skip_special_tokens=True)
    # Normalize apostrophes so the text matches the curly quotes used in exclude_keywords.
    generated_text = [text.replace("'", "’") for text in generated_text]

    del inputs, input_ids, attention_mask, generated_ids, generated_sequences, generated_out
    return generated_text, hidden_states_0
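
# Note on the saved tensors: with output_hidden_states=True, generate() returns
# hidden_states as a tuple with one entry per generation step. Entry [0] covers the
# prompt forward pass and is itself a tuple of (num_layers + 1) tensors, each shaped
# (batch, prompt_len, hidden_size). That is the object written to disk for each prompt.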


def generate_harmless_hidden_states(instruction, max_new_tokens=1):
    # Uses the module-level model and tokenizer, unlike the harmful variant above.
    messages = [
        {"role": "user", "content": instruction}
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        enable_thinking=False,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    attention_mask = torch.ones_like(input_ids, dtype=torch.long)

    tokens = input_ids.to("cuda:0")
    attention_mask = attention_mask.to("cuda:0")

    output = model.generate(
        tokens,
        attention_mask=attention_mask,
        use_cache=False,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        return_dict_in_generate=True,
        output_hidden_states=True
    )

    # Only the prompt-pass hidden states are needed; the generated text is discarded.
    hidden_states_0 = output.hidden_states[0]
    del input_ids, tokens, attention_mask, output
    return hidden_states_0


def CollectResponse(model, tokenizer, harmful_instructions, harmless_instructions, max_new_tokens=8):
    with torch.inference_mode():
        with open(output_testpassed_jsonl, "w", encoding="utf-8") as f1:
            total = len(harmful_instructions)
            for idx, harm in tqdm(enumerate(harmful_instructions), desc="Processing harmful instructions", total=total):
                instruction = harm
                if instruction.strip():
                    # Record the start of the model's reply so refusals can be detected later.
                    generated_text, hidden_states_0 = generate_harmful_hidden_states(model, tokenizer, instruction, max_new_tokens)
                    output_data = {
                        "generated_text": generated_text,
                        "idx": idx,
                        "instruction": instruction,
                    }
                    f1.write(json.dumps(output_data, ensure_ascii=False) + "\n")

                    torch.save(hidden_states_0, f"{output_dir}/harmful_hidden_state_{idx}.pt")
                    del hidden_states_0

                    # Save the hidden states of the paired harmless prompt under the same index.
                    hidden_states_0 = generate_harmless_hidden_states(harmless_instructions[idx])
                    torch.save(hidden_states_0, f"{output_dir}/harmless_hidden_state_{idx}.pt")
                    del hidden_states_0

                    torch.cuda.empty_cache()
                    gc.collect()
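

# The longest refusal phrase (in tokens) plus a 32-token margin determines how much of
# each reply is collected, so keyword matching has enough text to work with.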
max_new_tokens = 0
for instruction in exclude_keywords:
    tokens = tokenizer(instruction, add_special_tokens=False)
    token_ids = tokens["input_ids"]
    token_length = len(token_ids)
    if token_length > max_new_tokens:
        max_new_tokens = token_length

max_new_tokens += 32
print(f"Using max_new_tokens: {max_new_tokens}")

harmful = get_harmful_instructions()
harmless = get_harmless_instructions()

print(f"harmful len: {len(harmful)}")
print(f"harmless len: {len(harmless)}")

# Pair the two sets one-to-one; truncate the longer list.
n_instructions = min(len(harmful), len(harmless))
print("Instruction count: " + str(n_instructions))

harmful_instructions = harmful[:n_instructions]
harmless_instructions = harmless[:n_instructions]

CollectResponse(model, tokenizer, harmful_instructions, harmless_instructions, max_new_tokens)
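

# Downstream use (an assumption, not part of this collection script): the saved pairs can
# be loaded to compare harmful vs. harmless activations, for example via a per-layer
# difference at the last prompt token. A minimal sketch, assuming the file layout above:
def load_last_token_states(idx, kind="harmful"):
    # kind is "harmful" or "harmless"; returns one (hidden_size,) tensor per layer.
    states = torch.load(f"{output_dir}/{kind}_hidden_state_{idx}.pt", map_location="cpu")
    return [layer[0, -1, :].float() for layer in states]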