# Huihui-InternVL3_5-1B-Instruct-abliterated / 01-compute_refusal_dir-DeepSeek-R1-0528-bf16-4.py
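# Computes per-layer "refusal directions" for DeepSeek-R1-0528: for every instruction it
# loads previously saved hidden states of a harmful and a harmless prompt, averages them
# per layer, takes the difference of the harmful and harmless means at the last token
# position, normalizes it, and saves the directions in the original dtype and in float32.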
import torch
from tqdm import tqdm
torch.set_grad_enabled(False)  # a bare torch.inference_mode() call has no effect; disable autograd explicitly
torch.set_default_device("cuda")
MODEL_ID = "deepseek-ai/DeepSeek-R1-0528-bf16"
output_dir = "D:/models/deepseek-ai/DeepSeek-R1-0528-bf16/hidden_states"
output_dir1 = "G:/models/deepseek-ai/DeepSeek-R1-0528-bf16/hidden_states1"
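# Note: n_instructions and num_layers must match the companion script that saved the
# hidden-state .pt files; 61 presumably corresponds to the number of per-layer hidden
# states stored for DeepSeek-R1-0528.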
n_instructions = 5510
num_layers = 61
# Collect the per-layer hidden states used to compute the refusal directions
final_refusal_dirs = []

# Iterate over every instruction's saved data
for idx in tqdm(range(n_instructions), desc="Processing instruction"):
    harmful_hidden = torch.load(f"{output_dir}/harmful_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)
    harmless_hidden = torch.load(f"{output_dir}/harmless_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)

    # Process each layer
    for layer_idx in range(num_layers):
        # Hidden states of this instruction at this layer
        harmful_layer_hidden = harmful_hidden[layer_idx]
        harmless_layer_hidden = harmless_hidden[layer_idx]

        # Initialize the per-layer storage the first time this layer is seen
        if len(final_refusal_dirs) <= layer_idx:
            final_refusal_dirs.append([])

        # Store the harmful/harmless hidden-state pair for this layer
        final_refusal_dirs[layer_idx].append((harmful_layer_hidden, harmless_layer_hidden))

    # Free memory
    del harmful_hidden, harmless_hidden
# The second directory holds additional instructions; append them to the same per-layer lists
n_instructions = 1858
for idx in tqdm(range(n_instructions), desc="Processing instruction1"):
    harmful_hidden = torch.load(f"{output_dir1}/harmful_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)
    harmless_hidden = torch.load(f"{output_dir1}/harmless_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)

    # Process each layer
    for layer_idx in range(num_layers):
        # Hidden states of this instruction at this layer
        harmful_layer_hidden = harmful_hidden[layer_idx]
        harmless_layer_hidden = harmless_hidden[layer_idx]

        # Initialize the per-layer storage the first time this layer is seen
        if len(final_refusal_dirs) <= layer_idx:
            final_refusal_dirs.append([])

        # Store the harmful/harmless hidden-state pair for this layer
        final_refusal_dirs[layer_idx].append((harmful_layer_hidden, harmless_layer_hidden))

    # Free memory
    del harmful_hidden, harmless_hidden
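# At this point final_refusal_dirs[layer_idx] holds one (harmful, harmless) pair of
# hidden-state tensors per instruction. Each tensor is indexed as [:, pos, :] below, so
# it is presumably shaped [batch, seq_len, hidden_size] with batch = 1. The refusal
# direction of a layer is the normalized difference between the mean harmful and mean
# harmless hidden state at the last token position.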
# Compute the refusal direction for each layer
final_refusal_directions16 = []
final_refusal_directions32 = []
for layer_idx in range(num_layers):
    pos = -1  # use the hidden state at the last token position

    # Separate the harmful and harmless hidden states
    harmful_hidden_list = [hidden[0][:, pos, :] for hidden in final_refusal_dirs[layer_idx]]
    harmless_hidden_list = [hidden[1][:, pos, :] for hidden in final_refusal_dirs[layer_idx]]

    # Mean hidden state over all harmful and all harmless instructions
    harmful_mean = torch.stack(harmful_hidden_list).mean(dim=0)
    harmless_mean = torch.stack(harmless_hidden_list).mean(dim=0)

    # Refusal direction = difference of the two means
    mean_diff_norm = (harmful_mean - harmless_mean).norm().item()
    refusal_dir16 = harmful_mean - harmless_mean
    refusal_dir32 = refusal_dir16.to(torch.float32)

    if mean_diff_norm < 1e-6:
        print(f"Warning: Layer {layer_idx} has near-zero refusal_dir")
        refusal_dir16 = torch.zeros_like(refusal_dir16)
        refusal_dir32 = torch.zeros_like(refusal_dir32)
    else:
        refusal_dir16 = refusal_dir16 / refusal_dir16.norm()  # normalize
        refusal_dir32 = refusal_dir32 / refusal_dir32.norm()  # normalize

    print(f"layer {layer_idx:3d}: {mean_diff_norm:.6f}, {refusal_dir32.norm().item():.16f}")

    # Keep the refusal directions for this layer
    final_refusal_directions16.append(refusal_dir16)
    final_refusal_directions32.append(refusal_dir32)
# The final refusal directions are stored in final_refusal_directions16/32
torch.save(final_refusal_directions16, output_dir + "/final_refusal_dirs16-1.pt")
torch.save(final_refusal_directions32, output_dir + "/final_refusal_dirs32-1.pt")
print("Refusal directions saved successfully.")