Huihui-InternVL3_5-1B-Instruct-abliterated / 01-Collect-Response.py

Add files using upload-large-folder tool

26e1cba verified 12 days ago

2.97 kB

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from tqdm import tqdm
	import os
	import json
	import random

	# Seed Python's RNG, PyTorch's default generator and every CUDA generator
	# with the same fixed value.
	random.seed(42)
	torch.manual_seed(42)
	torch.cuda.manual_seed_all(42)

	MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
	# Responses are appended to a JSONL file located under the MODEL_ID path.
	# NOTE(review): nothing in this file creates that directory, so it
	# presumably has to exist already (e.g. MODEL_ID is also a local model
	# folder) -- confirm.
	output_testpassed_jsonl = f"{MODEL_ID}/Collect-Response.jsonl"

	print(f"Load Model {MODEL_ID} ... ")
	model = AutoModelForCausalLM.from_pretrained(
	    MODEL_ID,
	    device_map="balanced",
	    trust_remote_code=True,
	    torch_dtype=torch.bfloat16,
	)

	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
	# generate_response() hands tokenizer.pad_token_id to model.generate();
	# fall back to the EOS token when the tokenizer defines no pad token.
	if tokenizer.pad_token is None:
	    tokenizer.pad_token = tokenizer.eos_token
	    tokenizer.pad_token_id = tokenizer.eos_token_id

	def get_harmful_instructions(path="datasets19/harmful.txt"):
	    """Load instruction prompts, one per line, from a UTF-8 text file.

	    Args:
	        path: Text file to read. The default is the location this script
	            has always used, so zero-argument calls keep working.

	    Returns:
	        A list of instruction strings with the line terminator removed.
	        Lines that are empty or contain only whitespace are skipped.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    with open(path, "r", encoding="utf-8") as f:
	        # readlines() used to keep the trailing newline on every entry, so
	        # it ended up inside the chat prompt and the JSONL record; blank
	        # lines were also returned as prompts. Strip only the terminator so
	        # any other whitespace in the instruction is left untouched.
	        return [line.rstrip("\r\n") for line in f if line.strip()]

	def generate_response(model, tokenizer, instruction, max_new_tokens):
	    """Generate the model's reply to a single user instruction.

	    The instruction is wrapped as a one-message chat, rendered to text with
	    the tokenizer's chat template, tokenized, and passed to
	    ``model.generate``. Only the tokens that follow the prompt are decoded.

	    Args:
	        model: Object exposing ``device`` and ``generate``.
	        tokenizer: Callable tokenizer exposing ``apply_chat_template``,
	            ``batch_decode`` and ``pad_token_id``.
	        instruction: Text used as the content of the user message.
	        max_new_tokens: Forwarded unchanged to ``model.generate``.

	    Returns:
	        The decoded continuation of the first sequence, with every ASCII
	        apostrophe (') replaced by the typographic apostrophe (’).
	    """
	    chat = [{"role": "user", "content": instruction}]
	    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
	    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)
	    sequences = model.generate(
	        **encoded,
	        max_new_tokens=max_new_tokens,
	        pad_token_id=tokenizer.pad_token_id,
	    )

	    # Each returned sequence is assumed to begin with the prompt ids, so
	    # everything up to the prompt length is cut off before decoding.
	    prompt_length = len(encoded.input_ids[0])
	    continuations = [sequence[prompt_length:] for sequence in sequences]
	    decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)

	    # The keyword list in this file is written with ’ rather than ';
	    # normalise the output to the same character.
	    return decoded[0].replace("'", "’")

	def CollectResponse(model, tokenizer, harmful_instructions, max_new_tokens=8):
	    """Generate a short response per instruction and append it to the JSONL log.

	    For every instruction one JSON object with the keys ``generated_text``
	    and ``instruction`` is written as a single line to the module-level
	    path ``output_testpassed_jsonl``.

	    Args:
	        model: Passed through to ``generate_response``.
	        tokenizer: Passed through to ``generate_response``.
	        harmful_instructions: Sized iterable of instruction strings.
	        max_new_tokens: Generation limit handed to ``generate_response``
	            for each instruction (default 8).

	    Returns:
	        None. The results are only written to the file.
	    """
	    # Generation runs under inference mode. The file is opened in append
	    # mode, so records written by earlier runs are kept.
	    with torch.inference_mode(), open(output_testpassed_jsonl, "a", encoding="utf-8") as f1:
	        total = len(harmful_instructions)
	        progress = tqdm(harmful_instructions, desc="Processing harmful instructions", total=total)
	        for instruction in progress:
	            generated_text = generate_response(model, tokenizer, instruction, max_new_tokens)
	            record = {
	                "generated_text": generated_text,
	                "instruction": instruction,
	            }
	            # ensure_ascii=False writes non-ASCII characters (such as ’)
	            # literally instead of as \u escapes.
	            f1.write(json.dumps(record, ensure_ascii=False) + "\n")

	# Phrases used here only to size the generation budget. They are spelled
	# with the typographic apostrophe (’).
	# NOTE(review): presumably these are matched against the collected
	# responses in a later step -- confirm.
	exclude_keywords = ["I’m sorry", "I am sorry", "I cannot", "I can’t"]

	# Token count of the longest phrase (0 when the list is empty) ...
	max_new_tokens = max(
	    (
	        len(tokenizer(phrase, add_special_tokens=False)["input_ids"])
	        for phrase in exclude_keywords
	    ),
	    default=0,
	)
	# ... doubled to obtain the number of tokens generated per instruction.
	max_new_tokens *= 2
	print(f"Load max_new_tokens: {max_new_tokens}")

	harmful_instructions = get_harmful_instructions()
	CollectResponse(model, tokenizer, harmful_instructions, max_new_tokens)