import os
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm

cpu_count = os.cpu_count()
print(f"Number of CPU cores in the system: {cpu_count}")

# Cap math-library threading at half the cores to leave headroom for the OS
# and I/O.
half_cpu_count = max(1, cpu_count // 2)
os.environ["MKL_NUM_THREADS"] = str(half_cpu_count)
os.environ["OMP_NUM_THREADS"] = str(half_cpu_count)
torch.set_num_threads(half_cpu_count)

print(f"PyTorch threads: {torch.get_num_threads()}")
print(f"MKL threads: {os.getenv('MKL_NUM_THREADS')}")
print(f"OMP threads: {os.getenv('OMP_NUM_THREADS')}")
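# Note: MKL_NUM_THREADS and OMP_NUM_THREADS are typically read when the math
# libraries initialize, so exporting them this late may not affect thread
# pools that already exist; torch.set_num_threads() above is the reliable
# knob at this point in the script.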

MODEL_ID = "deepseek-ai/DeepSeek-R1-0528-bf16"
output_dir = "G:/models/deepseek-ai/DeepSeek-R1-0528-bf16/hidden_states1"

os.makedirs(output_dir, exist_ok=True)

print(f"Loading model {MODEL_ID} ...")
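
# 4-bit weights (with double quantization for a bit more savings) are what
# make a model this size fit on the GPUs mapped below, while
# llm_int8_enable_fp32_cpu_offload lets the layers assigned to "cpu" in the
# device map stay unquantized in system RAM.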
quant_config_4 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

NUM_TRANS_LAYERS = 61


def create_device_map():
    device_map = {
        'model.embed_tokens': 0,
        'model.norm': 0,
        'lm_head': 0,
    }

    # Spread the first 36 transformer layers across eight GPUs.
    for start, end, gpu_id in [(0, 5, 0), (5, 8, 1), (8, 11, 2), (11, 14, 3),
                               (14, 17, 4), (17, 20, 5), (20, 23, 6), (23, 36, 7)]:
        for i in range(start, end):
            device_map[f'model.layers.{i}'] = gpu_id

    # Offload the remaining layers to CPU RAM.
    for i in range(36, NUM_TRANS_LAYERS):
        device_map[f'model.layers.{i}'] = "cpu"

    return device_map


device_map = create_device_map()
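
# The split above is machine-specific: layers 0-35 across eight GPUs (fewer on
# GPU 0, which also holds the embeddings and lm_head) and layers 36-60 on CPU.
# Adjust the ranges to match your own VRAM.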

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=device_map,
    trust_remote_code=True,
    quantization_config=quant_config_4,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)


def print_model_params_and_devices(model, output_file="model_params.txt"):
    """Write each parameter's size and device placement to a text file."""
    total_params = 0
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("Model parameter distribution:\n")
        f.write("-" * 60 + "\n")
        for name, param in model.named_parameters():
            param_size = param.numel()
            device = param.device
            total_params += param_size
            f.write(f"{name}: {param_size:,} parameters, device {device}\n")
        f.write("-" * 60 + "\n")
        f.write(f"Total model parameters: {total_params:,}\n")
    print(f"The model parameter information has been written to {output_file}")


print_model_params_and_devices(model, output_dir + "/model_params.txt")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
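# Some checkpoints ship without a pad token; fall back to EOS so that
# model.generate() has a valid pad_token_id.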
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

num_layers = len(model.model.layers)
print(f"Model has {num_layers} layers.")

print("Load data ...")


def reformat_texts(texts):
    # Wrap each raw instruction as a single-turn user chat message.
    return [[{"role": "user", "content": text}] for text in texts]


def get_harmful_instructions():
    with open("datasets23/harmful.txt", "r", encoding="utf-8") as f:
        harmful = [line.strip() for line in f if line.strip()]
    return reformat_texts(harmful)


def get_harmless_instructions():
    with open("datasets23/harmless.txt", "r", encoding="utf-8") as f:
        harmless = [line.strip() for line in f if line.strip()]
    return reformat_texts(harmless)


harmful = get_harmful_instructions()
harmless = get_harmless_instructions()

print(f"harmful len: {len(harmful)}")
print(f"harmless len: {len(harmless)}")

# Truncate both sets to the same size so the two means are taken over equal
# numbers of prompts.
n_instructions = min(len(harmful), len(harmless))
print("Instruction count: " + str(n_instructions))

harmful_instructions = harmful[:n_instructions]
harmless_instructions = harmless[:n_instructions]

print("Tokenizer ...")

harmful_toks = [
    tokenizer.apply_chat_template(insn, tokenize=True, add_generation_prompt=True,
                                  return_tensors="pt", return_dict=True)
    for insn in harmful_instructions]
harmless_toks = [
    tokenizer.apply_chat_template(insn, tokenize=True, add_generation_prompt=True,
                                  return_tensors="pt", return_dict=True)
    for insn in harmless_instructions]
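
# Each entry above holds 'input_ids' and 'attention_mask' (return_dict=True);
# add_generation_prompt=True appends the assistant header, so the last prompt
# position is exactly where the model would begin its reply.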

max_its = n_instructions
bar = tqdm(total=max_its)


def generate_and_process(toks, label, idx):
    tokens = toks['input_ids'].to(model.device)
    attention_mask = toks['attention_mask'].to(model.device)

    # One forward pass is all that is needed: generate a single token and
    # capture the hidden states of the prompt.
    output = model.generate(tokens,
                            attention_mask=attention_mask,
                            use_cache=False,
                            max_new_tokens=1,
                            pad_token_id=tokenizer.pad_token_id,
                            return_dict_in_generate=True,
                            output_hidden_states=True)

    # output.hidden_states[0] is the prompt pass: a tuple holding the embedding
    # output plus one tensor per layer, each of shape [batch, seq_len, hidden].
    hidden_states_0 = output.hidden_states[0]
    torch.save(hidden_states_0, f"{output_dir}/{label}_hidden_state_{idx}.pt")

    del toks, tokens, attention_mask, output, hidden_states_0
    gc.collect()
    torch.cuda.empty_cache()

print("\nGenerate and process...")

for idx, (harmful_tok, harmless_tok) in enumerate(zip(harmful_toks, harmless_toks)):
    bar.update(n=1)
    # Resume point: skip indices whose hidden states were already saved by a
    # previous run.
    if idx < 2446:
        continue
    generate_and_process(harmful_tok, 'harmful', idx)
    generate_and_process(harmless_tok, 'harmless', idx)

bar.close()

# Free the model before the CPU-side aggregation below.
del model, tokenizer
gc.collect()
torch.cuda.empty_cache()


final_refusal_dirs = []

for idx in tqdm(range(n_instructions), desc="Processing instruction"):
    harmful_hidden = torch.load(f"{output_dir}/harmful_hidden_state_{idx}.pt",
                                map_location='cpu', weights_only=True)
    harmless_hidden = torch.load(f"{output_dir}/harmless_hidden_state_{idx}.pt",
                                 map_location='cpu', weights_only=True)

    # Collect (harmful, harmless) hidden-state pairs, one list per layer.
    for layer_idx in range(num_layers):
        harmful_layer_hidden = harmful_hidden[layer_idx]
        harmless_layer_hidden = harmless_hidden[layer_idx]

        if len(final_refusal_dirs) <= layer_idx:
            final_refusal_dirs.append([])

        final_refusal_dirs[layer_idx].append((harmful_layer_hidden, harmless_layer_hidden))

    del harmful_hidden, harmless_hidden
    torch.cuda.empty_cache()
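
# Note: each saved file is a tuple of num_layers + 1 tensors with the embedding
# output at index 0, so layer_idx here is shifted by one relative to the
# transformer blocks and the last layer's output (index num_layers) is unused.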


final_refusal_directions16 = []
final_refusal_directions32 = []

for layer_idx in range(num_layers):
    pos = -1  # last prompt-token position

    harmful_hidden_list = [hidden[0][:, pos, :] for hidden in final_refusal_dirs[layer_idx]]
    harmless_hidden_list = [hidden[1][:, pos, :] for hidden in final_refusal_dirs[layer_idx]]

    harmful_mean = torch.stack(harmful_hidden_list).mean(dim=0)
    harmless_mean = torch.stack(harmless_hidden_list).mean(dim=0)

    # The difference of means is the candidate refusal direction; keep a bf16
    # and a float32 copy, both normalized to unit length.
    refusal_dir16 = harmful_mean - harmless_mean
    refusal_dir32 = refusal_dir16.to(torch.float32)
    mean_diff_norm = refusal_dir16.norm().item()

    if mean_diff_norm < 1e-6:
        print(f"Warning: Layer {layer_idx} has near-zero refusal_dir")
        refusal_dir16 = torch.zeros_like(refusal_dir16)
        refusal_dir32 = torch.zeros_like(refusal_dir32)
    else:
        refusal_dir16 = refusal_dir16 / refusal_dir16.norm()
        refusal_dir32 = refusal_dir32 / refusal_dir32.norm()

    print(f"layer {layer_idx:3d}: {mean_diff_norm:.6f}, {refusal_dir32.norm().item():.16f}")

    final_refusal_directions16.append(refusal_dir16)
    final_refusal_directions32.append(refusal_dir32)


torch.save(final_refusal_directions16, output_dir + "/final_refusal_dirs16.pt")
torch.save(final_refusal_directions32, output_dir + "/final_refusal_dirs32.pt")
print("Refusal directions saved successfully.")
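
# A minimal sketch of how a saved direction might be consumed downstream
# (assumption: directional ablation, projecting the refusal component out of
# a hidden state h at the matching layer):
#
#   dirs = torch.load(output_dir + "/final_refusal_dirs32.pt", weights_only=True)
#   r = dirs[layer_idx]            # unit vector, shape [1, hidden_size]
#   h_ablated = h - (h @ r.T) * r  # h: [seq_len, hidden_size], float32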