# Qwen3-8B-train-away-lying-lr1e5-temp10-penalize-neutral-neutral
This model was trained using the Hindsight Oversight pipeline.
## Training Configuration
- Base model: Qwen/Qwen3-8B
- Training strategy: REINFORCE
- Detection strategy: penalize
- Epochs: 1
- Batch size: 32
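- Learning rate: 0.0001 (note: the model name contains `lr1e5`, which suggests 1e-5; confirm which value was actually used)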
- LoRA: Yes
- LoRA r: 16
- LoRA alpha: 32
- LoRA target modules: ['q_proj', 'v_proj', 'k_proj', 'o_proj']
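## Evaluation Metrics

Per-batch counts for epoch 0 (`epoch_0_batch_0` through `epoch_0_batch_27`). Each batch contains 32 samples, except the final batch, which contains 11.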
{
"epoch_0_batch_0": {
"proxy_lying_count": 23,
"proxy_truth_count": 9,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_1": {
"proxy_lying_count": 17,
"proxy_truth_count": 15,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_2": {
"proxy_lying_count": 15,
"proxy_truth_count": 17,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_3": {
"proxy_lying_count": 14,
"proxy_truth_count": 18,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_4": {
"proxy_lying_count": 19,
"proxy_truth_count": 13,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_5": {
"proxy_lying_count": 13,
"proxy_truth_count": 19,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_6": {
"proxy_lying_count": 17,
"proxy_truth_count": 15,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_7": {
"proxy_lying_count": 14,
"proxy_truth_count": 18,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_8": {
"proxy_lying_count": 11,
"proxy_truth_count": 21,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_9": {
"proxy_lying_count": 14,
"proxy_truth_count": 18,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_10": {
"proxy_lying_count": 15,
"proxy_truth_count": 17,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_11": {
"proxy_lying_count": 11,
"proxy_truth_count": 21,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_12": {
"proxy_lying_count": 11,
"proxy_truth_count": 21,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_13": {
"proxy_lying_count": 8,
"proxy_truth_count": 24,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_14": {
"proxy_lying_count": 11,
"proxy_truth_count": 21,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_15": {
"proxy_lying_count": 15,
"proxy_truth_count": 17,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_16": {
"proxy_lying_count": 10,
"proxy_truth_count": 22,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_17": {
"proxy_lying_count": 13,
"proxy_truth_count": 19,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_18": {
"proxy_lying_count": 16,
"proxy_truth_count": 16,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_19": {
"proxy_lying_count": 17,
"proxy_truth_count": 15,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_20": {
"proxy_lying_count": 14,
"proxy_truth_count": 18,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_21": {
"proxy_lying_count": 9,
"proxy_truth_count": 23,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_22": {
"proxy_lying_count": 8,
"proxy_truth_count": 24,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_23": {
"proxy_lying_count": 16,
"proxy_truth_count": 16,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_24": {
"proxy_lying_count": 15,
"proxy_truth_count": 17,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_25": {
"proxy_lying_count": 16,
"proxy_truth_count": 16,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_26": {
"proxy_lying_count": 15,
"proxy_truth_count": 17,
"gt_lying_count": 0,
"gt_truth_count": 32
},
"epoch_0_batch_27": {
"proxy_lying_count": 4,
"proxy_truth_count": 7,
"gt_lying_count": 0,
"gt_truth_count": 11
}
}