aifeifei798 committed
Commit 448356a · verified · 1 Parent(s): 8f8818c

Delete 演示程序

演示程序/diagnose_layers.py DELETED
@@ -1,163 +0,0 @@
- # ====================================================================================
- # diagnose_layers.py
- # Purpose: diagnose which layers of the model host a given behavior (such as
- # "safety refusal") by computing and visualizing the per-layer significance of
- # the A/B-side activation difference.
- # ====================================================================================
-
- print("--- Running Layer Diagnosis Script ---")
-
- import torch
- import gc
- import matplotlib.pyplot as plt
- import numpy as np
-
- from datasets import load_dataset
- from tqdm import tqdm
- from collections import defaultdict
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- # --- [Configuration] ---
- MODEL_ID = "./gemma-3-4b-it-qat-q4_0-unquantized"
- # Use a small sample count for a quick diagnosis
- NUM_SAMPLES_TO_DIAGNOSE = 64
- BATCH_SIZE = 4
- OUTPUT_CHART_FILENAME = "layer_significance_chart.png"
-
- # --- [STEP 1] Set up the model and tokenizer ---
- print(f"\n[STEP 1] Loading model and tokenizer from: {MODEL_ID}")
- torch.set_grad_enabled(False)
-
- hf_model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     local_files_only=True,
-     torch_dtype=torch.bfloat16,
-     device_map="auto"
- )
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, local_files_only=True)
- tokenizer.padding_side = 'left'
- if tokenizer.pad_token is None:
-     tokenizer.pad_token = tokenizer.eos_token
-
- TOTAL_LAYERS = hf_model.config.text_config.num_hidden_layers
- print(f"[SUCCESS] Model with {TOTAL_LAYERS} layers and tokenizer loaded.")
-
-
- # --- [STEP 2] Prepare the data ---
- print(f"\n[STEP 2] Preparing datasets with {NUM_SAMPLES_TO_DIAGNOSE} samples each...")
-
- def reformat_texts(texts):
-     return [[{"role": "user", "content": text}] for text in texts]
-
- # Load harmful instructions (A-side)
- harmful_dataset = load_dataset('./harmful_behaviors')
- harmful_inst = reformat_texts(harmful_dataset['train']['text'])[:NUM_SAMPLES_TO_DIAGNOSE]
-
- # Load harmless instructions (B-side)
- harmless_dataset = load_dataset('./harmless_alpaca')
- harmless_inst = reformat_texts(harmless_dataset['train']['text'])[:NUM_SAMPLES_TO_DIAGNOSE]
-
- print("[SUCCESS] Datasets prepared.")
-
-
- # --- [STEP 3] Define helpers for activation collection ---
-
- # Compute the maximum sequence length across all instructions
- all_texts = [instr[0]['content'] for instr in harmful_inst + harmless_inst]
- max_len = max([tokenizer(text, return_tensors="pt").input_ids.shape[1] for text in all_texts])
- print(f"Max sequence length calculated: {max_len}")
-
- def tokenize_instructions(tokenizer, instructions, max_length):
-     return tokenizer.apply_chat_template(
-         instructions, padding="max_length", truncation=True, max_length=max_length,
-         return_tensors="pt", return_dict=True, add_generation_prompt=True,
-     )
-
- def get_activations(model, instructions, num_samples):
-     """Collect the residual-stream activations of every layer."""
-     cache = defaultdict(list)
-     def create_hook_fn(layer_name):
-         def hook_fn(module, input, output):
-             # We only care about the last token's activation, and we only
-             # collect the residual-stream output (output[0]).
-             cache[layer_name].append(output[0][:, -1, :].cpu())
-         return hook_fn
-
-     hooks = []
-     for i in range(TOTAL_LAYERS):
-         layer_name = f"layer_{i}"
-         module = model.get_submodule(f"model.language_model.layers.{i}")
-         hook = module.register_forward_hook(create_hook_fn(layer_name))
-         hooks.append(hook)
-
-     num_batches = (num_samples + BATCH_SIZE - 1) // BATCH_SIZE
-     for i in tqdm(range(num_batches), desc="Collecting activations"):
-         start_idx, end_idx = i * BATCH_SIZE, min(num_samples, (i + 1) * BATCH_SIZE)
-         batch_instructions = instructions[start_idx:end_idx]
-         tokenized_input = tokenize_instructions(tokenizer, batch_instructions, max_length=max_len).to(model.device)
-         model(**tokenized_input)
-
-     for hook in hooks: hook.remove()
-
-     # Concatenate the per-batch tensors
-     for layer_name, activations in cache.items():
-         cache[layer_name] = torch.cat(activations, dim=0)
-
-     return cache
-
- # --- [STEP 4] Collect activations for both datasets ---
- print("\n[STEP 4] Collecting activations for both datasets...")
-
- print("Collecting for Harmful dataset (A-Side)...")
- harmful_activations = get_activations(hf_model, harmful_inst, NUM_SAMPLES_TO_DIAGNOSE)
-
- print("Collecting for Harmless dataset (B-Side)...")
- harmless_activations = get_activations(hf_model, harmless_inst, NUM_SAMPLES_TO_DIAGNOSE)
-
- print("[SUCCESS] All activations collected.")
-
-
- # --- [STEP 5] Compute and visualize each layer's significance ---
- print("\n[STEP 5] Calculating and visualizing layer significance...")
-
- layer_significance = []
- layer_indices = range(TOTAL_LAYERS)
-
- for l in layer_indices:
-     layer_name = f"layer_{l}"
-
-     # Mean activation of each dataset at this layer
-     harmful_mean_act = harmful_activations[layer_name].mean(dim=0)
-     harmless_mean_act = harmless_activations[layer_name].mean(dim=0)
-
-     # Difference vector between the two means
-     diff_vector = harmful_mean_act - harmless_mean_act
-
-     # Its L2 norm (magnitude) serves as the significance score
-     significance = torch.linalg.norm(diff_vector).item()
-     layer_significance.append(significance)
-     print(f"Layer {l:02d}: Significance (L2 Norm of diff) = {significance:.4f}")
-
- # Free memory
- del harmful_activations, harmless_activations, hf_model
- gc.collect()
- torch.cuda.empty_cache()
-
- # Plot the chart
- print(f"\n[STEP 6] Generating chart and saving to {OUTPUT_CHART_FILENAME}...")
- plt.style.use('seaborn-v0_8-whitegrid')  # a clean-looking style
- fig, ax = plt.subplots(figsize=(15, 7))
-
- ax.plot(layer_indices, layer_significance, marker='o', linestyle='-', color='royalblue', label='Signal Significance')
- ax.set_title('Significance of "Refusal Signal" Across Model Layers', fontsize=16, fontweight='bold')
- ax.set_xlabel('Layer Index', fontsize=12)
- ax.set_ylabel('Significance Score (L2 Norm of Activation Difference)', fontsize=12)
- ax.grid(True, which='both', linestyle='--', linewidth=0.5)
- ax.set_xticks(np.arange(0, TOTAL_LAYERS, 2))  # one tick every 2 layers
- ax.legend()
- plt.tight_layout()
-
- # Save the chart
- plt.savefig(OUTPUT_CHART_FILENAME)
-
- print(f"\n[SUCCESS] Diagnosis complete. Chart saved to '{OUTPUT_CHART_FILENAME}'.")
- print("You can now analyze this chart to determine the optimal layers for your fine-tuning surgery.")
演示程序/finalize_save.py DELETED
@@ -1,59 +0,0 @@
- # ====================================================================================
- # finalize_save.py (v1.1 - Memory Optimized)
- # ====================================================================================
-
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
- import gc  # garbage-collection module
-
- print("--- Running the final model-saving script (memory-optimized) ---")
-
- # --- [STEP 1] Define the model and paths ---
- MODEL_ID = "./gemma-3-4b-it-qat-q4_0-unquantized"
- NEW_MODEL_ID = "./gemma-3-4b-it-qat-q4_0-unquantized-precise-abliterated"
- WEIGHTS_FILE = "./consolidated_precise_abliterated_weights.pth"
-
- print(f"Original model ID: {MODEL_ID}")
- print(f"Weights file path: {WEIGHTS_FILE}")
- print(f"Final save path: {NEW_MODEL_ID}")
-
-
- # --- [STEP 2] Load a clean copy of the original model onto the CPU ---
- print("\n[STEP 2] Loading a clean copy of the original model architecture on the CPU...")
- clean_model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     local_files_only=True,
-     torch_dtype=torch.bfloat16,
-     device_map="cpu"
- )
- print("[SUCCESS] Clean model loaded.")
-
-
- # --- [STEP 3] Load the previously saved state dict ---
- print(f"\n[STEP 3] Loading the post-surgery weights from {WEIGHTS_FILE}...")
- consolidated_weights = torch.load(WEIGHTS_FILE, map_location="cpu")
- print("[SUCCESS] Weights file loaded.")
-
-
- # --- [STEP 4] Apply the weights to the clean model ---
- print("\n[STEP 4] Applying the weights to the model...")
- clean_model.load_state_dict(consolidated_weights)
- print("[SUCCESS] Weights applied.")
-
-
- # --- [Added step] Free memory ---
- print("\n[Memory optimization] Releasing the loaded weight dict to save memory...")
- del consolidated_weights  # the weight dict is no longer needed
- gc.collect()  # force a garbage-collection pass
- print("[SUCCESS] Memory released.")
-
-
- # --- [STEP 5] Save the final model and tokenizer ---
- print(f"\n[STEP 5] Saving the final, complete model to: {NEW_MODEL_ID}")
- clean_model.save_pretrained(NEW_MODEL_ID)
-
- # The tokenizer must be saved alongside the model
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, local_files_only=True)
- tokenizer.save_pretrained(NEW_MODEL_ID)
-
- print(f"\n[ALL DONE!] Your post-surgery model has been saved to: {NEW_MODEL_ID}")
演示程序/precise_surgery.py DELETED
@@ -1,238 +0,0 @@
- # ====================================================================================
- # FINAL VERSION v7.0 (Precise Layer-Specific Surgery)
- # Key change: based on the diagnosis chart, operate only on the model's upper
- # layers, for maximum effect with minimal side effects.
- # ====================================================================================
- print("--- Running Abliteration Script v7.0 (Precise Surgery Edition) ---")
-
- import torch
- import functools
- import einops
- import gc
- import sys
-
- from datasets import load_dataset
- from tqdm import tqdm
- from torch import Tensor
- from typing import List, Dict
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from jaxtyping import Float, Int
- from collections import defaultdict
-
- print("[INFO] All libraries imported successfully.")
-
- torch.set_grad_enabled(False)
-
- def reformat_texts(texts):
-     return [[{"role": "user", "content": text}] for text in texts]
-
- def get_harmful_instructions():
-     dataset = load_dataset('./harmful_behaviors')
-     return reformat_texts(dataset['train']['text']), reformat_texts(dataset['test']['text'])
-
- def get_harmless_instructions():
-     dataset = load_dataset('./harmless_alpaca')
-     return reformat_texts(dataset['train']['text']), reformat_texts(dataset['test']['text'])
-
- # --- [STEP 1] Set up the model and paths ---
- print("\n[STEP 1] Setting up model configuration...")
- MODEL_ID = "./gemma-3-4b-it-qat-q4_0-unquantized"
- # [Changed] New model name to reflect the "precise surgery"
- NEW_MODEL_ID = "./gemma-3-4b-it-qat-q4_0-unquantized-precise-abliterated"
- print(f"Model ID: {MODEL_ID}")
-
- # --- [STEP 2] Load the model and tokenizer ---
- print("\n[STEP 2] Loading model and tokenizer...")
- hf_model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     local_files_only=True,
-     torch_dtype=torch.bfloat16,
-     device_map="auto"
- )
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, local_files_only=True)
- tokenizer.padding_side = 'left'
- if tokenizer.pad_token is None:
-     tokenizer.pad_token = tokenizer.eos_token
- print("[SUCCESS] Model and tokenizer loaded.")
-
- # --- [STEP 3] Prepare data and compute the global max length ---
- print("\n[STEP 3] Preparing datasets and calculating max sequence length...")
- harmful_inst_train, harmful_inst_test = get_harmful_instructions()
- harmless_inst_train, harmless_inst_test = get_harmless_instructions()
-
- n_inst_train = min(256, len(harmful_inst_train), len(harmless_inst_train))
- batch_size = 4
- print(f"Using {n_inst_train} training samples per dataset with batch size {batch_size}.")
-
- all_instructions = harmful_inst_train[:n_inst_train] + harmless_inst_train[:n_inst_train]
- all_texts = [instr[0]['content'] for instr in all_instructions]
-
- print("Iterating through instructions to find the max token length...")
- tokenized_lengths = []
- for instruction in tqdm(reformat_texts(all_texts), desc="Tokenizing for max_len"):
-     tokenized_output = tokenizer.apply_chat_template(instruction, add_generation_prompt=True, return_tensors="pt")
-     tokenized_lengths.append(tokenized_output.shape[1])
- max_len = max(tokenized_lengths) if tokenized_lengths else 512
- print(f"[SUCCESS] Max sequence length set to: {max_len}")
-
- # --- [STEP 4] Define the tokenization function ---
- def tokenize_instructions(tokenizer, instructions, max_length):
-     return tokenizer.apply_chat_template(
-         instructions, padding="max_length", truncation=True, max_length=max_length,
-         return_tensors="pt", return_dict=True, add_generation_prompt=True,
-     )
- print("\n[STEP 4] Tokenization function defined.")
-
- # --- [STEP 5] Collect activations with PyTorch hooks ---
- # (Unchanged: we still need the activations of every layer for potential analysis.)
- print("\n[STEP 5] Preparing for activation collection...")
- harmful, harmless = defaultdict(list), defaultdict(list)
- NUM_LAYERS = hf_model.config.text_config.num_hidden_layers
- print(f"Model has {NUM_LAYERS} layers.")
-
- def get_activations(model: AutoModelForCausalLM, instructions: List[Dict], cache_dict: Dict):
-     temp_cache = defaultdict(list)
-     def create_hook_fn(layer_name):
-         def hook_fn(module, input, output):
-             temp_cache[layer_name].append(output[0].cpu())
-         return hook_fn
-
-     hooks = []
-     for i in range(NUM_LAYERS):
-         layer_name = f"blocks.{i}.hook_resid_post"
-         module = model.get_submodule(f"model.language_model.layers.{i}")
-         hook = module.register_forward_hook(create_hook_fn(layer_name))
-         hooks.append(hook)
-
-     num_batches = (n_inst_train + batch_size - 1) // batch_size
-     for i in tqdm(range(num_batches), desc="Collecting activations"):
-         start_idx, end_idx = i * batch_size, min(n_inst_train, (i + 1) * batch_size)
-         tokenized_input = tokenize_instructions(tokenizer, instructions[start_idx:end_idx], max_length=max_len).to(model.device)
-         temp_cache.clear()
-         model(**tokenized_input)
-         for layer_name, activations in temp_cache.items():
-             cache_dict[layer_name].extend(activations)
-     for hook in hooks: hook.remove()
-
- print("Collecting harmful activations...")
- get_activations(hf_model, harmful_inst_train[:n_inst_train], harmful)
- print("Collecting harmless activations...")
- get_activations(hf_model, harmless_inst_train[:n_inst_train], harmless)
-
- harmful = {k: torch.cat(v) for k, v in harmful.items()}
- harmless = {k: torch.cat(v) for k, v in harmless.items()}
- print("[SUCCESS] All activations collected and concatenated.")
-
-
- # --- [STEP 6 - Precise Surgery] Define the surgery scope and compute the target vector ---
- # [New] Define the surgery scope. Based on the diagnosis chart, we pick the last 10 layers.
- START_LAYER = 24
- END_LAYER = NUM_LAYERS
- print(f"\n[PRECISE SURGERY] Defining surgery scope to layers [{START_LAYER}, {END_LAYER-1}].")
-
- def get_act_idx(cache_dict, act_name, layer): return cache_dict[f"blocks.{layer}.{act_name}"]
-
- # [Changed] Compute the vector only from the target layers' activations
- print(f"Computing refusal direction vector from layers {START_LAYER}-{END_LAYER-1}...")
- refusal_directions_in_scope = []
- for layer_num in range(START_LAYER, END_LAYER):
-     harmful_mean_act = get_act_idx(harmful, "hook_resid_post", layer_num)[:, -1, :].mean(dim=0)
-     harmless_mean_act = get_act_idx(harmless, "hook_resid_post", layer_num)[:, -1, :].mean(dim=0)
-     refusal_dir = harmful_mean_act - harmless_mean_act
-     refusal_directions_in_scope.append(refusal_dir)
-
- # [Changed] Average the in-scope vectors to obtain the final surgery vector
- print("Averaging and normalizing the direction vector...")
- refusal_dir = torch.stack(refusal_directions_in_scope).mean(dim=0)
- refusal_dir /= refusal_dir.norm()  # normalize
- print("[SUCCESS] Precise refusal direction vector computed.")
-
-
- # --- [STEP 7 - Precise Surgery] Modify weights only in the target layers ---
- print("\n[STEP 7] Applying orthogonalization surgery to specified layers...")
- def get_orthogonalized_matrix(matrix, vec):
-     proj = (einops.einsum(matrix, vec.view(-1, 1), "d ..., d s -> ... s") * vec).T
-     return matrix - proj
-
- # [Changed] The embedding layer is still modified, since it acts globally and usually improves results
- print("Orthogonalizing the Embedding layer...")
- target_device = hf_model.model.language_model.embed_tokens.weight.device
- hf_model.model.language_model.embed_tokens.weight.data = get_orthogonalized_matrix(hf_model.model.language_model.embed_tokens.weight.data.T, refusal_dir.to(target_device)).T
-
- # [Changed] Loop only over the layer range we defined
- for l in tqdm(range(START_LAYER, END_LAYER), desc=f"Orthogonalizing layers {START_LAYER}-{END_LAYER-1}"):
-     layer = hf_model.get_submodule(f"model.language_model.layers.{l}")
-
-     # Modify o_proj
-     target_device = layer.self_attn.o_proj.weight.device
-     layer.self_attn.o_proj.weight.data = get_orthogonalized_matrix(layer.self_attn.o_proj.weight.data, refusal_dir.to(target_device))
-
-     # Modify down_proj
-     target_device = layer.mlp.down_proj.weight.device
-     layer.mlp.down_proj.weight.data = get_orthogonalized_matrix(layer.mlp.down_proj.weight.data, refusal_dir.to(target_device))
- print("[SUCCESS] Precise surgery on model weights has been completed.")
-
-
- # --- [STEP 8] & [STEP 9] Evaluate and compare (unchanged) ---
- print("\n[STEP 8] Generating text for evaluation...")
- def generate_text(model, tokenizer, instructions, max_new_tokens=150, max_len_for_gen=512):
-     generations = []
-     gen_batch_size = 2
-     for i in tqdm(range(0, len(instructions), gen_batch_size), desc="Generating text"):
-         prompts = instructions[i : i + gen_batch_size]
-         inputs = tokenize_instructions(tokenizer, prompts, max_length=max_len_for_gen).to(model.device)
-         generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
-         input_len = inputs['input_ids'].shape[1]
-         generations.extend(tokenizer.batch_decode(generated_ids[:, input_len:], skip_special_tokens=True))
-     return generations
-
- N_INST_TEST = 4
- print("Generating baseline completions...")
- baseline_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, local_files_only=True, torch_dtype=torch.bfloat16, device_map="auto")
- baseline_generations = generate_text(baseline_model, tokenizer, harmful_inst_test[:N_INST_TEST])
- del baseline_model
- gc.collect()
- torch.cuda.empty_cache()
- print("[SUCCESS] Baseline completions generated.")
-
- print("Generating orthogonalized completions...")
- orthogonalized_generations = generate_text(hf_model, tokenizer, harmful_inst_test[:N_INST_TEST])
- print("[SUCCESS] Orthogonalized completions generated.")
-
- print("\n--- FINAL RESULTS ---")
- for i in range(N_INST_TEST):
-     print(f"\n--- INSTRUCTION {i+1} ---")
-     print(f"PROMPT: {harmful_inst_test[i][0]['content']}")
-     print(f"\033[92mBASELINE COMPLETION:\n{baseline_generations[i]}\033[0m")
-     print(f"\033[95mORTHOGONALIZED (ABLITERATED) COMPLETION:\n{orthogonalized_generations[i]}\033[0m\n")
-
-
- # --- [STEP 10] Save the weights (unchanged) ---
- print("\n[STEP 10] Extracting and saving weights (High-Compatibility Mode)...")
- from accelerate.hooks import remove_hook_from_module
- import warnings
- warnings.filterwarnings("ignore", message="You are removing the hook that was added by `Accelerate`")
-
- print("Step 1/3: Removing Accelerate hooks from the model...")
- hf_model.apply(remove_hook_from_module)
- print("...Hooks successfully removed.")
-
- print("Step 2/3: Moving the entire model to the CPU to consolidate weights...")
- try:
-     hf_model = hf_model.to('cpu')
-     print("...Model successfully consolidated on CPU.")
- except Exception as e:
-     print(f"[FATAL ERROR] An error occurred while moving the model to CPU: {e}")
-     sys.exit(1)
-
- gc.collect()
- torch.cuda.empty_cache()
-
- print("Step 3/3: Extracting the state dictionary and saving to file...")
- consolidated_state_dict = hf_model.state_dict()
- # [Changed] Name the weights file after the new model
- weights_save_path = "./consolidated_precise_abliterated_weights.pth"
- print(f"Saving the consolidated weights to: {weights_save_path}")
- torch.save(consolidated_state_dict, weights_save_path)
-
- print(f"\n[SUCCESS!] Your model weights have been successfully saved to {weights_save_path}.")
- print("You can now run the 'finalize_save.py' script to create the final model folder.")