# Huihui-InternVL3_5-1B-Instruct-abliterated / 01-compute_refusal_dir-DeepSeek-R1-0528-bf16-2.py
import jaxtyping
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import einops
from tqdm import tqdm
from datasets import load_dataset
import os
cpu_count = os.cpu_count()
print(f"Number of CPU cores in the system: {cpu_count}")
half_cpu_count = max(1, cpu_count // 2)  # avoid requesting zero threads on single-core machines
os.environ["MKL_NUM_THREADS"] = str(half_cpu_count)
os.environ["OMP_NUM_THREADS"] = str(half_cpu_count)
torch.set_num_threads(half_cpu_count)
print(f"PyTorch threads: {torch.get_num_threads()}")
print(f"MKL threads: {os.getenv('MKL_NUM_THREADS')}")
print(f"OMP threads: {os.getenv('OMP_NUM_THREADS')}")
MODEL_ID = "deepseek-ai/DeepSeek-R1-0528-bf16"
output_dir = "G:/models/deepseek-ai/DeepSeek-R1-0528-bf16/hidden_states1"
# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)
print(f"Load Model {MODEL_ID} ... ")
quant_config_4 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
NUM_TRANS_LAYERS = 61
def create_device_map():
    device_map = {
        'model.embed_tokens': 0,
        'model.norm': 0,
        'lm_head': 0
    }
    # Can load up to 26 layers on the GPUs
    for start, end, gpu_id in [(0, 5, 0), (5, 8, 1), (8, 11, 2), (11, 14, 3), (14, 17, 4), (17, 20, 5), (20, 23, 6), (23, 36, 7)]:
    #for start, end, gpu_id in [(0, 5, 0), (5, 8, 1), (8, 11, 2), (11, 14, 3), (14, 26, 4)]:
    #for start, end, gpu_id in [(0, 5, 0), (5, 8, 1), (8, 11, 2), (11, 14, 3), (14, 17, 4), (17, 20, 5), (20, 23, 6), (23, 26, 7)]:
    #for start, end, gpu_id in [(0, 5, 0), (5, 8, 1), (8, 11, 2), (11, 14, 3), (14, 17, 4), (17, 20, 5), (20, 23, 6)]:
    #for start, end, gpu_id in [(0, 5, 0), (5, 8, 1), (8, 11, 2), (11, 14, 3), (14, 17, 4)]:
        for i in range(start, end):
            device_map[f'model.layers.{i}'] = gpu_id
    # Remaining layers are offloaded to CPU
    for i in range(36, NUM_TRANS_LAYERS):
        device_map[f'model.layers.{i}'] = "cpu"
    return device_map
device_map = create_device_map()
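# Hedged sanity check (added, not part of the original script): make sure every transformer
# layer 0..NUM_TRANS_LAYERS-1 received a device assignment before loading the model.
missing_layers = [i for i in range(NUM_TRANS_LAYERS) if f'model.layers.{i}' not in device_map]
if missing_layers:
    raise ValueError(f"Layers without a device assignment: {missing_layers}")
cpu_layers = sum(1 for k, v in device_map.items() if k.startswith('model.layers.') and v == "cpu")
print(f"Device map covers all {NUM_TRANS_LAYERS} transformer layers ({cpu_layers} offloaded to CPU).")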
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=device_map,
    trust_remote_code=True,
    quantization_config=quant_config_4,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
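# Optional report (added, an assumption about useful logging): print the quantized model's
# approximate in-memory size; get_memory_footprint() is provided by transformers' PreTrainedModel.
print(f"Approximate model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")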
def print_model_params_and_devices(model):
    total_params = 0
    print("Model parameter distribution:")
    print("-" * 60)
    for name, param in model.named_parameters():
        param_size = param.numel()  # number of elements in this parameter tensor
        device = param.device       # device the parameter lives on
        total_params += param_size
        print(f"{name}: {param_size:,} parameters, device {device}")
    print("-" * 60)
    print(f"Total model parameters: {total_params:,}")

# Overrides the console version above: writes the same report to a file instead.
def print_model_params_and_devices(model, output_file="model_params.txt"):
    total_params = 0
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("Model parameter distribution:\n")
        f.write("-" * 60 + "\n")
        for name, param in model.named_parameters():
            param_size = param.numel()  # number of elements in this parameter tensor
            device = param.device       # device the parameter lives on
            total_params += param_size
            f.write(f"{name}: {param_size:,} parameters, device {device}\n")
        f.write("-" * 60 + "\n")
        f.write(f"Total model parameters: {total_params:,}\n")
    print(f"The model parameter information has been written to {output_file}")

# Dump parameter placement so the device map can be verified after loading
print_model_params_and_devices(model, output_dir + "/model_params.txt")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
#tokenizer_kwargs = {'enable_thinking': False} if 'qwen3' in MODEL_ID.lower() else {}
num_layers = len(model.model.layers)
print(f"Model has {num_layers} layers.")
print(f"Load data ... ")
# Wrap each text as a chat message list with "role" and "content" keys
def reformat_texts(texts):
    return [[{"role": "user", "content": text}] for text in texts]

def get_harmful_instructions():
    with open("datasets23/harmful.txt", "r", encoding="utf-8") as f:
        harmful = f.readlines()
    return reformat_texts(harmful)  # reformat the instructions into chat format

def get_harmless_instructions():
    with open("datasets23/harmless.txt", "r", encoding="utf-8") as f:
        harmless = f.readlines()
    return reformat_texts(harmless)  # reformat the instructions into chat format
# Load the harmful instructions
harmful = get_harmful_instructions()
# Load the harmless instructions
harmless = get_harmless_instructions()
print(f"harmful len: {len(harmful)}")
print(f"harmless len: {len(harmless)}")
n_instructions = min(len(harmful), len(harmless))
print("Instruction count: " + str(n_instructions))
harmful_instructions = harmful[:n_instructions]
harmless_instructions = harmless[:n_instructions]
print("Tokenizer ... ")
harmful_toks = [
    tokenizer.apply_chat_template(insn, tokenize=True, add_generation_prompt=True,
                                  return_tensors="pt", return_dict=True) for insn in harmful_instructions]
harmless_toks = [
    tokenizer.apply_chat_template(insn, tokenize=True, add_generation_prompt=True,
                                  return_tensors="pt", return_dict=True) for insn in harmless_instructions]
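# Optional check (added, not in the original): report the longest tokenized prompt so the
# memory cost of the single forward pass below can be estimated in advance.
max_prompt_len = max(t['input_ids'].shape[1] for t in harmful_toks + harmless_toks)
print(f"Longest tokenized prompt: {max_prompt_len} tokens")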
max_its = n_instructions
bar = tqdm(total=max_its)
import gc  # garbage collection, used to free memory between iterations

def generate_and_process(toks, label, idx):
    # Move input_ids and attention_mask to the model's device
    tokens = toks['input_ids'].to(model.device)
    attention_mask = toks['attention_mask'].to(model.device)
    # Generate a single token; output_hidden_states=True returns the hidden states
    output = model.generate(tokens,
                            attention_mask=attention_mask,
                            use_cache=False,
                            max_new_tokens=1,
                            #do_sample=True,
                            pad_token_id=tokenizer.pad_token_id,
                            return_dict_in_generate=True,
                            output_hidden_states=True)
    # Save output.hidden_states[0] (the prompt's hidden states: one tensor per layer,
    # each shaped [batch, prompt_len, hidden_dim]) to disk
    #print(f"output.hidden_states len = {len(output.hidden_states)}")
    hidden_states_0 = output.hidden_states[0]
    torch.save(hidden_states_0, f"{output_dir}/{label}_hidden_state_{idx}.pt")
    # Delete only the intermediates that are no longer needed; keep the model loaded
    del toks, tokens, attention_mask, output, hidden_states_0
    torch.cuda.empty_cache()  # release cached GPU memory
    gc.collect()              # run garbage collection
print("\nGenerate and process...")
for idx, (harm_ful_toks, harm_less_toks) in enumerate(zip(harmful_toks, harmless_toks)):
    bar.update(n=1)
    if idx < 2446:
        # Skip instructions already processed (presumably resuming an earlier, interrupted run)
        continue
    generate_and_process(harm_ful_toks, 'harmful', idx)
    generate_and_process(harm_less_toks, 'harmless', idx)
bar.close()
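# Optional sanity check (added, not in the original): load one saved hidden-state file and
# report its structure. output.hidden_states[0] from generate() is expected to be a tuple of
# (num_layers + 1) tensors, each shaped [batch, prompt_len, hidden_dim].
sample_path = f"{output_dir}/harmful_hidden_state_{n_instructions - 1}.pt"
if os.path.exists(sample_path):
    sample = torch.load(sample_path, map_location='cpu', weights_only=True)
    print(f"Sample {sample_path}: {len(sample)} tensors, first shape {tuple(sample[0].shape)}, dtype {sample[0].dtype}")
    del sample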
del model, tokenizer
torch.cuda.empty_cache()  # release cached GPU memory
gc.collect()  # run garbage collection
# Collect the per-layer hidden states used to compute the refusal directions
final_refusal_dirs = []

# Iterate over the saved hidden states for every instruction
for idx in tqdm(range(n_instructions), desc="Processing instruction"):
    harmful_hidden = torch.load(f"{output_dir}/harmful_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)
    harmless_hidden = torch.load(f"{output_dir}/harmless_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)
    # Process each layer
    for layer_idx in range(num_layers):
        # Hidden state of this layer for the current instruction
        harmful_layer_hidden = harmful_hidden[layer_idx]
        harmless_layer_hidden = harmless_hidden[layer_idx]
        # Initialize storage for this layer on first encounter
        if len(final_refusal_dirs) <= layer_idx:
            final_refusal_dirs.append([])
        # Store the harmful/harmless hidden-state pair for this layer
        final_refusal_dirs[layer_idx].append((harmful_layer_hidden, harmless_layer_hidden))
    # Free memory
    del harmful_hidden, harmless_hidden
    torch.cuda.empty_cache()
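# Hedged sanity check (added, not in the original): after the loop, every layer should hold
# one (harmful, harmless) hidden-state pair per instruction.
if len(final_refusal_dirs) != num_layers or any(len(pairs) != n_instructions for pairs in final_refusal_dirs):
    print("Warning: collected hidden-state pairs do not match the expected layer/instruction counts")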
# Compute the refusal direction for every layer
final_refusal_directions16 = []
final_refusal_directions32 = []
for layer_idx in range(0, num_layers):
    pos = -1  # use the hidden state at the last prompt token
    # Separate the harmful and harmless hidden states
    harmful_hidden_list = [hidden[0][:, pos, :] for hidden in final_refusal_dirs[layer_idx]]
    harmless_hidden_list = [hidden[1][:, pos, :] for hidden in final_refusal_dirs[layer_idx]]
    # Mean hidden state over all harmful and over all harmless instructions
    harmful_mean = torch.stack(harmful_hidden_list).mean(dim=0)
    harmless_mean = torch.stack(harmless_hidden_list).mean(dim=0)
    mean_diff_norm = (harmful_mean - harmless_mean).norm().item()
    refusal_dir16 = harmful_mean - harmless_mean
    refusal_dir32 = refusal_dir16.to(torch.float32)
    if mean_diff_norm < 1e-6:
        print(f"Warning: Layer {layer_idx} has near-zero refusal_dir")
        refusal_dir16 = torch.zeros_like(refusal_dir16)
        refusal_dir32 = torch.zeros_like(refusal_dir32)
    else:
        refusal_dir16 = refusal_dir16 / refusal_dir16.norm()  # normalize to unit length
        refusal_dir32 = refusal_dir32 / refusal_dir32.norm()  # normalize to unit length
    print(f"layer {layer_idx:3d}:{mean_diff_norm:.6f}, {refusal_dir32.norm().item():.16f}")
    # Store the refusal direction for this layer
    final_refusal_directions16.append(refusal_dir16)
    final_refusal_directions32.append(refusal_dir32)

# The final refusal directions are stored in final_refusal_directions16/32
torch.save(final_refusal_directions16, output_dir + "/final_refusal_dirs16.pt")
torch.save(final_refusal_directions32, output_dir + "/final_refusal_dirs32.pt")
print("Refusal directions saved successfully.")