|
import jaxtyping |
|
import random |
|
import torch |
|
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig |
|
import einops |
|
from tqdm import tqdm |
|
from datasets import load_dataset |
|
|
|
import os |
|
|
|
# --- Runtime setup -------------------------------------------------------
# Disable autograd globally: this script only loads saved activations and
# does forward tensor arithmetic, never a backward pass.
# NOTE(review): the original bare `torch.inference_mode()` call was a no-op —
# it constructs a context manager that is never entered. set_grad_enabled(False)
# achieves the intended effect as a plain statement.
torch.set_grad_enabled(False)

# NOTE(review): every torch.load below maps to CPU and all math runs on those
# CPU tensors, so the default device appears to have no effect here — verify
# before removing. Requires a CUDA-enabled torch build to be meaningful.
torch.set_default_device("cuda")

# Model the hidden states were extracted from (not loaded in this script;
# kept for traceability of the saved activations).
MODEL_ID = "deepseek-ai/DeepSeek-R1-0528-bf16"

# Two directories of pre-computed per-instruction hidden states to merge.
output_dir = "D:/models/deepseek-ai/DeepSeek-R1-0528-bf16/hidden_states"
output_dir1 = "G:/models/deepseek-ai/DeepSeek-R1-0528-bf16/hidden_states1"

# Number of instruction pairs in the first directory (reassigned to 1858
# before the second directory is processed further down).
n_instructions = 5510
# Number of transformer layers captured per instruction.
num_layers = 61
|
|
|
|
|
# final_refusal_dirs[layer_idx] is a list of (harmful_hidden, harmless_hidden)
# tensor pairs, one pair per instruction, accumulated across both directories.
final_refusal_dirs = []


def _collect_hidden_state_pairs(src_dir, count, desc):
    """Load per-instruction hidden states from *src_dir* into final_refusal_dirs.

    For each instruction index in ``range(count)`` this loads
    ``harmful_hidden_state_{idx}.pt`` and ``harmless_hidden_state_{idx}.pt``
    (each indexable by layer; presumably a tuple/list of per-layer tensors —
    TODO confirm against the extraction script) and appends the per-layer
    (harmful, harmless) pair to ``final_refusal_dirs[layer_idx]``.

    Args:
        src_dir: directory containing the saved ``.pt`` files.
        count: number of instruction pairs to load.
        desc: tqdm progress-bar label (kept distinct per directory to
            preserve the original console output).
    """
    for idx in tqdm(range(count), desc=desc):
        harmful_hidden = torch.load(f"{src_dir}/harmful_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)
        harmless_hidden = torch.load(f"{src_dir}/harmless_hidden_state_{idx}.pt", map_location='cpu', weights_only=True)

        for layer_idx in range(num_layers):
            # Grow the outer list lazily on the first instruction.
            if len(final_refusal_dirs) <= layer_idx:
                final_refusal_dirs.append([])
            final_refusal_dirs[layer_idx].append((harmful_hidden[layer_idx], harmless_hidden[layer_idx]))

        # Drop references before the next load to keep peak memory down.
        del harmful_hidden, harmless_hidden


# First directory: 5510 instruction pairs.
_collect_hidden_state_pairs(output_dir, n_instructions, "Processing instruction")

# Second directory: 1858 instruction pairs (reassignment kept so the
# module-level value of n_instructions matches the original script).
n_instructions = 1858
_collect_hidden_state_pairs(output_dir1, n_instructions, "Processing instruction1")
|
|
|
|
|
|
|
final_refusal_directions16 = []
final_refusal_directions32 = []

# Token position to read from each hidden state: the last token of the sequence.
POS = -1


def _layer_refusal_dirs(layer_pairs, pos=POS):
    """Compute the unit refusal-direction vectors for one layer.

    Args:
        layer_pairs: list of (harmful_hidden, harmless_hidden) tensor pairs;
            each tensor is indexed as ``t[:, pos, :]`` so it is assumed to be
            (batch, seq, hidden) — TODO confirm against the extraction script.
        pos: sequence position to read (default: last token).

    Returns:
        (dir16, dir32, mean_diff_norm) where dir16 is the normalized
        difference of mean harmful vs. mean harmless activations in the
        original dtype, dir32 the same in float32, and mean_diff_norm the
        raw (pre-normalization) difference norm. Near-zero differences
        (< 1e-6) yield zero vectors instead of dividing by ~0.
    """
    harmful_mean = torch.stack([h[:, pos, :] for h, _ in layer_pairs]).mean(dim=0)
    harmless_mean = torch.stack([h[:, pos, :] for _, h in layer_pairs]).mean(dim=0)

    # Compute the difference once (the original recomputed it for the norm).
    diff = harmful_mean - harmless_mean
    mean_diff_norm = diff.norm().item()
    dir32 = diff.to(torch.float32)

    if mean_diff_norm < 1e-6:
        return torch.zeros_like(diff), torch.zeros_like(dir32), mean_diff_norm
    return diff / diff.norm(), dir32 / dir32.norm(), mean_diff_norm


for layer_idx in range(num_layers):
    refusal_dir16, refusal_dir32, mean_diff_norm = _layer_refusal_dirs(final_refusal_dirs[layer_idx])

    if mean_diff_norm < 1e-6:
        print(f"Warning: Layer {layer_idx} has near-zero refusal_dir")

    print(f"layer {layer_idx:3d}:{mean_diff_norm:.6f}, {refusal_dir32.norm().item():.16f}")

    final_refusal_directions16.append(refusal_dir16)
    final_refusal_directions32.append(refusal_dir32)


torch.save(final_refusal_directions16, output_dir + "/final_refusal_dirs16-1.pt")
torch.save(final_refusal_directions32, output_dir + "/final_refusal_dirs32-1.pt")

print("Refusal directions saved successfully.")
|
|