import jaxtyping import random import torch from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig import einops from tqdm import tqdm from datasets import load_dataset import os os.environ["MKL_NUM_THREADS"] = "72" os.environ["OMP_NUM_THREADS"] = "72" torch.set_num_threads(72) # 设置为物理核心数量 print(f"PyTorch threads: {torch.get_num_threads()}") print(f"MKL threads: {os.getenv('MKL_NUM_THREADS')}") print(f"OMP threads: {os.getenv('OMP_NUM_THREADS')}") torch.inference_mode() torch.set_default_device("cuda") MODEL_ID = "deepseek-ai/DeepSeek-R1-bf16" output_dir = MODEL_ID + "/hidden_states" # 检查并创建目录(如果不存在) os.makedirs(output_dir, exist_ok=True) print(f"Load Model {MODEL_ID} ... ") quant_config_4 = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, llm_int8_enable_fp32_cpu_offload=True, ) NUM_TRANS_LAYERS = 61 def create_device_map(): device_map = { 'model.embed_tokens': 0, 'model.norm': 0, 'model.rotary_emb': 0, 'lm_head': 0 } #for start, end, gpu_id in [(0, 1, 0), (1, 5, 1), (5, 7, 2), (7, 9, 3), (9, 11, 4), (11, 13, 5), (13, 15, 6), (15, 17, 7)]: #for start, end, gpu_id in [(0, 2, 0), (2, 6, 1), (6, 9, 2), (9, 12, 3), (12, 15, 4), (15, 18, 5), (18, 21, 6), (21, 24, 7)]: for start, end, gpu_id in [(0, 5, 0), (5, 8, 1), (8, 11, 2), (11, 14, 3), (14, 17, 4), (17, 20, 5), (20, 23, 6), (23, 26, 7)]: for i in range(start, end): device_map[f'model.layers.{i}'] = gpu_id for i in range(26, NUM_TRANS_LAYERS): device_map[f'model.layers.{i}'] = "cpu" return device_map device_map = create_device_map() model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map=device_map, trust_remote_code=True, quantization_config=quant_config_4, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, ) def print_model_params_and_devices(model): total_params = 0 print("模型参数分布:") print("-" * 60) for name, param in model.named_parameters(): param_size = param.numel() # 参数总数 device = param.device # 参数所在的设备 total_params += param_size print(f"{name}: {param_size:,} 参数, 设备 {device}") print("-" * 60) print(f"模型总参数量: {total_params:,}") def print_model_params_and_devices(model, output_file="model_params.txt"): total_params = 0 with open(output_file, "w", encoding="utf-8") as f: f.write("模型参数分布:\n") f.write("-" * 60 + "\n") for name, param in model.named_parameters(): param_size = param.numel() # 参数总数 device = param.device # 参数所在的设备 total_params += param_size f.write(f"{name}: {param_size:,} parameters, device {device}\n") f.write("-" * 60 + "\n") f.write(f"模型总参数量: {total_params:,}\n") print(f"The model parameter information has been written to {output_file}") # 调用函数打印信息 print_model_params_and_devices(model, output_dir + "/model_params.txt") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token tokenizer.pad_token_id = tokenizer.eos_token_id #tokenizer_kwargs = {'enable_thinking': False} if 'qwen3' in MODEL_ID.lower() else {} num_layers = len(model.model.layers) print(f"Model has {num_layers} layers.") print(f"Load data ... ") # 重新格式化文本,将每个文本包装成包含 "role" 和 "content" 的字典 def reformat_texts(texts): return [[{"role": "user", "content": text}] for text in texts] def get_harmful_instructions(): with open("datasets21/harmful.txt", "r", encoding="utf-8") as f: harmful = f.readlines() return reformat_texts(harmful) # 重新格式化训练和测试数据 def get_harmless_instructions(): with open("datasets21/harmless.txt", "r", encoding="utf-8") as f: harmless = f.readlines() return reformat_texts(harmless) # 重新格式化训练和测试数据 # 获取有害的训练和测试指令 harmful = get_harmful_instructions() # 获取无害的训练和测试指令 harmless = get_harmless_instructions() print(f"harmful len: {len(harmful)}") print(f"harmless len: {len(harmless)}") n_instructions = min(len(harmful), len(harmless)) print("Instruction count: " + str(n_instructions)) harmful_instructions = harmful[:n_instructions] harmless_instructions = harmless[:n_instructions] print("Tokenizer ... ") harmful_toks = [ tokenizer.apply_chat_template(insn, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True) for insn in harmful_instructions] harmless_toks = [ tokenizer.apply_chat_template(insn, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True) for insn in harmless_instructions] max_its = n_instructions bar = tqdm(total=max_its) import gc # 添加垃圾收集模块 def generate_and_process(toks, label, idx): # 将 input_ids 和 attention_mask 移动到 GPU 上 tokens = toks['input_ids'].to("cuda:0") attention_mask = toks['attention_mask'].to("cuda:0") # 生成输出 output = model.generate(tokens, attention_mask=attention_mask, use_cache=False, max_new_tokens=1, do_sample=True, pad_token_id=tokenizer.pad_token_id, return_dict_in_generate=True, output_hidden_states=True) # 保存 output.hidden_states[0] 到硬盘 #print(f"output.hidden_states len = {len(output.hidden_states)}") hidden_states_0 = output.hidden_states[0] torch.save(hidden_states_0, f"{output_dir}/{label}_hidden_state_{idx}.pt") # 只删除不再需要的中间变量,保留模型 del toks, tokens, attention_mask, output, hidden_states_0 torch.cuda.empty_cache() # 释放GPU缓存 gc.collect() # 进行垃圾回收 print("\nGenerate and process...") for idx, (harm_ful_toks, harm_less_toks) in enumerate(zip(harmful_toks, harmless_toks)): bar.update(n=1) generate_and_process(harm_ful_toks, 'harmful', idx) generate_and_process(harm_less_toks, 'harmless', idx) bar.close() del model, tokenizer torch.cuda.empty_cache() # 释放GPU缓存 gc.collect() # 进行垃圾回收 # 处理拒绝向量的计算 final_refusal_dirs = [] # 遍历每一条指令的数据 for idx in tqdm(range(n_instructions), desc="Processing instruction"): harmful_hidden = torch.load(f"{output_dir}/harmful_hidden_state_{idx}.pt", map_location='cpu', weights_only=True) harmless_hidden = torch.load(f"{output_dir}/harmless_hidden_state_{idx}.pt", map_location='cpu', weights_only=True) # 针对每一层处理 for layer_idx in range(num_layers): # 获取该指令的每一层的隐藏状态 harmful_layer_hidden = harmful_hidden[layer_idx] harmless_layer_hidden = harmless_hidden[layer_idx] # 如果这是第一次处理该层,初始化该层的存储 if len(final_refusal_dirs) <= layer_idx: final_refusal_dirs.append([]) # 保存该层的有害和无害隐藏状态 final_refusal_dirs[layer_idx].append((harmful_layer_hidden, harmless_layer_hidden)) # 释放内存 del harmful_hidden, harmless_hidden torch.cuda.empty_cache() # 计算每一层的拒绝向量 final_refusal_directions16 = [] final_refusal_directions32 = [] for layer_idx in range(0, num_layers): pos = -1 # 将有害和无害隐藏状态分开 harmful_hidden_list = [hidden[0][:, pos, :] for hidden in final_refusal_dirs[layer_idx]] harmless_hidden_list = [hidden[1][:, pos, :] for hidden in final_refusal_dirs[layer_idx]] # 计算有害和无害隐藏状态的均值 harmful_mean = torch.stack(harmful_hidden_list).mean(dim=0) harmless_mean = torch.stack(harmless_hidden_list).mean(dim=0) mean_diff_norm = (harmful_mean - harmless_mean).norm().item() refusal_dir16 = harmful_mean - harmless_mean refusal_dir32 = refusal_dir16.to(torch.float32) if mean_diff_norm < 1e-6: print(f"Warning: Layer {layer_idx} has near-zero refusal_dir") refusal_dir16 = torch.zeros_like(refusal_dir16) refusal_dir32 = torch.zeros_like(refusal_dir32) else: refusal_dir16 = refusal_dir16 / refusal_dir16.norm() # 归一化 refusal_dir32 = refusal_dir32 / refusal_dir32.norm() # 归一化 print(f"layer {layer_idx:3d}:{mean_diff_norm:.6f}, {refusal_dir32.norm().item():.16f}") # 保存拒绝向量 final_refusal_directions16.append(refusal_dir16) final_refusal_directions32.append(refusal_dir32) # 最终的拒绝向量存储在 final_refusal_directions 中 torch.save(final_refusal_directions16, output_dir + "/final_refusal_dirs16.pt") torch.save(final_refusal_directions32, output_dir + "/final_refusal_dirs32.pt") print("Refusal directions saved successfully.") refusal_data = [] for layer_idx, refusal_dir in enumerate(final_refusal_directions32): value = refusal_dir.norm().item() refusal_data.append((layer_idx, value)) #print(f"layer {layer_idx:3d}:{refusal_dir.norm().item():.6f}") sorted_data = sorted(refusal_data, key=lambda x: (-x[1], x[0])) for layer_idx, value in sorted_data: print(f"layer {layer_idx}:{value:.16f}") print("----------") test_layes = [] print("test_layes = [", end="") for layer_idx, value in sorted_data: if value < 1.0: print(f"'{layer_idx}', ", end="") test_layes.append(layer_idx) print("]") print("----------") for _, layer_idx in enumerate(test_layes): print(f"layer {layer_idx}")