File size: 26,401 Bytes

---
base_model:
- zerofata/MS3.2-PaintedFantasy-Visage-v2-33B
library_name: transformers
tags:
- mergekit
- merge
- axolotl
license: apache-2.0
---
<style>
.container {
  --primary-accent: #EC83B1;
  --secondary-accent: #86C5E5;
  --tertiary-accent: #FDE484;
  --accent-rose: #F8A5C2;
  
  --bg-main: #1A1D2E;
  --bg-container: #232741;
  --bg-card: rgba(40, 45, 70, 0.7);
  
  --text-main: #E8ECF0;
  --text-muted: #B8C2D0;
  --white: #FFFFFF;
  
  --font-title: 'Inter', serif;
  --font-heading: 'Inter', serif;
  --font-body: 'Inter', serif;
  --font-code: 'JetBrains Mono', monospace;

  font-family: var(--font-body);
  color: var(--text-main);
  line-height: 1.6;
  
  max-width: 1200px;
  margin: 20px auto;
  padding: 40px 20px;
  background-color: var(--bg-container);
  background-image:
    radial-gradient(circle at 20% 80%, rgba(236, 131, 177, 0.04) 0%, transparent 50%),
    radial-gradient(circle at 80% 20%, rgba(134, 197, 229, 0.04) 0%, transparent 50%),
    radial-gradient(circle at 40% 40%, rgba(253, 228, 132, 0.02) 0%, transparent 50%);
  min-height: calc(100vh - 40px);
  border: 1px solid var(--primary-accent);
  border-radius: 8px;
  box-shadow: 0 8px 32px rgba(236, 131, 177, 0.07);
}

.container .title-container {
  background-color: var(--bg-main);
  position: relative;
  overflow: hidden;
  margin-bottom: 40px;
  border-left: 3px solid var(--primary-accent);
  box-shadow: 0 6px 20px rgba(236, 131, 177, 0.07);
}

.container .title-wrapper {
  position: relative;
  z-index: 2;
  padding: 25px 20px 30px 30px;
  font-family: var(--font-title);
}

.container .title-main {
  color: var(--accent-rose);
  font-size: 2.5rem;
  font-weight: 700;
  margin: 0;
  letter-spacing: 2px;
  display: inline-block;
  position: relative;
  text-transform: uppercase;
}

.container .title-prefix {
  position: relative;
  z-index: 2;
}

.container .lemonade-text {
  color: var(--secondary-accent);
  position: relative;
  z-index: 2;
  margin-left: 0.2em;
  text-shadow: 0 0 15px var(--secondary-accent);
}

.container .title-subtitle {
  padding-left: 15px;
  margin-top: 5px;
  margin-left: 5px;
}

.container .subtitle-text {
  color: var(--text-muted);
  font-size: 1.2rem;
  font-family: var(--font-body);
  font-weight: 300;
  letter-spacing: 3px;
  text-transform: uppercase;
  display: inline-block;
}

.container .glitchy-overlay {
  position: absolute;
  top: 0;
  left: 0;
  width: 100%;
  height: 100%;
  background-image: repeating-linear-gradient(0deg, rgba(0,0,0,0) 0, rgba(134, 197, 229, 0.08) 1px, rgba(0,0,0,0) 2px);
  z-index: 1;
}

.container img {
  max-width: 100%;
  border: 3px solid var(--white);
  margin-bottom: 30px;
  box-shadow: 0 0 15px rgba(0, 0, 0, 0.3);
}

.container .section-container {
  background-color: var(--bg-card);
  margin-bottom: 30px;
  position: relative;
  overflow: hidden;
  border-bottom: none !important;
  box-shadow: 0 4px 15px rgba(236, 131, 177, 0.05);
}

.container .section-header {
  display: flex;
  align-items: center;
  background-color: rgba(236, 131, 177, 0.12);
  padding: 10px 20px;
  border-bottom: none !important;
}

.container .section-indicator {
  width: 8px;
  height: 20px;
  background-color: var(--primary-accent);
  margin-right: 15px;
  box-shadow: 0 0 8px rgba(236, 131, 177, 0.2);
}

.container .section-title {
  font-family: var(--font-heading);
  color: var(--accent-rose);
  font-size: 1.4rem;
  margin: 0 !important;
  padding: 0 !important;
  letter-spacing: 1px;
  font-weight: 400;
  text-transform: capitalize;
  border-bottom: none !important;
}

.container .section-content {
  padding: 20px;
  font-family: var(--font-body);
  color: var(--text-main);
  line-height: 1.6;
}

.container .subheading {
  color: var(--text-muted);
  font-size: 1.1rem;
  margin-top: 20px;
  margin-bottom: 15px;
  font-weight: 400;
  border-bottom: 1px dashed rgba(184, 194, 208, 0.4);
  display: inline-block;
  text-transform: uppercase;
  letter-spacing: 1px;
  font-family: var(--font-heading);
}

.container .data-box {
  background-color: rgba(26, 29, 46, 0.6);
  padding: 15px;
  border-left: 2px solid var(--primary-accent);
  margin-bottom: 20px;
  box-shadow: 0 2px 10px rgba(236, 131, 177, 0.05);
}

.container .data-row {
  display: flex;
  margin-bottom: 8px;
  align-items: center;
}
.container .data-row:last-child { margin-bottom: 0; }

.container .data-arrow {
  color: var(--primary-accent);
  width: 20px;
  display: inline-block;
}

.container .data-label {
  color: var(--text-muted);
  width: 80px;
  display: inline-block;
}

.container a {
  color: var(--secondary-accent);
  text-decoration: none;
  font-weight: 600;
  transition: color .3s;
}

.container a:hover {
  text-decoration: underline;
  color: var(--accent-rose);
}

.container .data-box a {
  position: relative;
  background-image: linear-gradient(to top, var(--primary-accent), var(--primary-accent));
  background-position: 0 100%;
  background-repeat: no-repeat;
  background-size: 0% 2px;
  transition: background-size .3s, color .3s;
}

.container .data-box a:hover {
  color: var(--primary-accent);
  background-size: 100% 2px;
}

.container .dropdown-container {
  margin-top: 20px;
}

.container .dropdown-summary {
  cursor: pointer;
  padding: 10px 0;
  border-bottom: 1px dashed rgba(184, 194, 208, 0.4);
  color: var(--text-muted);
  font-size: 1.1rem;
  font-weight: 400;
  text-transform: uppercase;
  letter-spacing: 1px;
  font-family: var(--font-heading);
  list-style: none;
  display: flex;
  align-items: center;
}

.container .dropdown-summary::-webkit-details-marker {
  display: none;
}

.container .dropdown-arrow {
  color: var(--primary-accent);
  margin-right: 10px;
  transition: transform 0.3s ease;
}

.container details[open] .dropdown-arrow {
  transform: rotate(90deg);
}

.container .dropdown-content {
  margin-top: 15px;
  padding: 15px;
  background-color: rgba(26, 29, 46, 0.6);
  border-left: 2px solid var(--primary-accent);
  box-shadow: 0 2px 10px rgba(236, 131, 177, 0.05);
}

.container .config-title {
  color: var(--text-muted);
  font-size: 1rem;
  margin-bottom: 10px;
  font-family: var(--font-heading);
  text-transform: uppercase;
  letter-spacing: 1px;
}

.container pre {
  background-color: var(--bg-main);
  padding: 15px;
  border: 1px solid rgba(134, 197, 229, 0.4);
  white-space: pre-wrap;
  word-wrap: break-word;
  color: var(--text-main);
  border-radius: 4px;
}

.container code {
  font-family: var(--font-code);
  background: transparent;
  padding: 0;
}
</style>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Painted Fantasy</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;700&display=swap" rel="stylesheet">
</head>
<body>

<div class="container">
  <div class="title-container">
    <div class="glitchy-overlay"></div>
    <div class="title-wrapper">
      <h1 class="title-main">
        <span class="title-prefix">PAINTED FANTASY</span>
        <span class="lemonade-text">VISAGE v2</span>
      </h1>
      <div class="title-subtitle">
        <span class="subtitle-text">Mistral Small 3.2 Upscaled 33B</span>
      </div>
    </div>
  </div>

![image/png](https://cdn-uploads.huggingface.co/production/uploads/65b19c6c638328850e12d38c/9FaT1DO7_b0B_sea1LiP7.png)

  <div class="section-container">
    <div class="section-header">
      <div class="section-indicator"></div>
      <h2 class="section-title">Overview</h2>
    </div>
    <div class="section-content">
      <p>This was a surprisingly difficult model to work with: removing the repetition kept coming at the expense of the unique creativity the original upscale had.</p>
      <p>The final approach was to upscale Painted Fantasy v2, heal it, and then merge the original upscale back in.</p>
      <p>The result is a smarter, uncensored, creative model that excels at character-driven RP / ERP where characters are portrayed creatively and proactively.</p>
    </div>
  </div>

  <div class="section-container">
    <div class="section-header">
      <div class="section-indicator"></div>
      <h2 class="section-title">SillyTavern Settings</h2>
    </div>
    <div class="section-content">
      <h3 class="subheading">Recommended Roleplay Format</h3>
      <div class="data-box">
        <div class="data-row">
            <span class="data-arrow">></span>
            <span class="data-label">Actions:</span>
            <span>In plaintext</span>
        </div>
      <div class="data-row">
            <span class="data-arrow">></span>
            <span class="data-label">Dialogue:</span>
            <span>"In quotes"</span>
      </div>
      <div class="data-row">
            <span class="data-arrow">></span>
            <span class="data-label">Thoughts:</span>
            <span>*In asterisks*</span>
      </div>
      </div>
      <h3 class="subheading">Recommended Samplers</h3>
      <div class="data-box">
        <div class="data-row">
          <span class="data-arrow">></span>
          <span class="data-label">Temp:</span>
          <span>0.6</span>
        </div>
        <div class="data-row">
          <span class="data-arrow">></span>
          <span class="data-label">MinP:</span>
          <span>0.05 - 0.1</span>
        </div>
        <div class="data-row">
          <span class="data-arrow">></span>
          <span class="data-label">TopP:</span>
          <span>0.9 - 1.0</span>
        </div>
        <div class="data-row">
          <span class="data-arrow">></span>
          <span class="data-label">Dry:</span>
          <span>0.8, 1.75, 4</span>
        </div>
      </div>
      <h3 class="subheading">Instruct</h3>
      <div class="data-box">
        <p style="margin: 0;">Mistral v7 Tekken</p>
      </div>
    </div>
  </div>

  <div class="section-container">
    <div class="section-header">
      <div class="section-indicator"></div>
      <h2 class="section-title">Quantizations</h2>
    </div>
    <div class="section-content">
      <div style="margin-bottom: 20px;">
        <h3 class="subheading">GGUF</h3>
        <div class="data-box">
          <div class="data-row">
            <span class="data-arrow">></span>
            <a href="https://huggingface.co/bartowski/zerofata_MS3.2-PaintedFantasy-Visage-v2-33B-GGUF">iMatrix (bartowski)</a>
          </div>
        </div>
      </div>
      <div>
        <h3 class="subheading">EXL3</h3>
        <div class="data-box">
          <div class="data-row">
            <span class="data-arrow">></span>
            <a href="https://huggingface.co/zerofata/MS3.2-PaintedFantasy-Visage-v2-33B-exl3-3bpw">3bpw</a>
          </div>
          <div class="data-row">
            <span class="data-arrow">></span>
            <a href="https://huggingface.co/zerofata/MS3.2-PaintedFantasy-Visage-v2-33B-exl3-4bpw">4bpw</a>
          </div>
          <div class="data-row">
            <span class="data-arrow">></span>
            <a href="https://huggingface.co/zerofata/MS3.2-PaintedFantasy-Visage-v2-33B-exl3-5bpw">5bpw</a>
          </div>
          <div class="data-row">
            <span class="data-arrow">></span>
            <a href="https://huggingface.co/zerofata/MS3.2-PaintedFantasy-Visage-v2-33B-exl3-6bpw">6bpw</a>
          </div>
        </div>
      </div>
    </div>
  </div>

  <div class="section-container">
    <div class="section-header">
      <div class="section-indicator"></div>
      <h2 class="section-title">Creation Process</h2>
    </div>
    <div class="section-content">
      <p>Creation Process: Upscale > PT > SFT > KTO > DPO</p>
      <p>Pretrained on approx. 300 MB of light novels, stories and the FineWeb-2 corpus.</p>
      <p>SFT on approx. 8 million tokens of SFW / NSFW RP, stories and creative instruct data.</p>
      <p>KTO on anti-repetition data created from the SFT datasets. Rejected examples were generated by MS3.2 with repetition_penalty=0.9 (a value below 1.0 encourages repetition) and OOC commands encouraging it to misgender, impersonate the user, etc.</p>
      <p>DPO on an unreleased, high-quality RP / NSFW dataset, using rejected samples created with the same method as for KTO.</p>
      <p>The resulting model was non-repetitive, but had lost some of the spark the original upscale had. The original upscale was merged back in, taking care not to reintroduce repetition.</p>
              <div class="dropdown-container">
          <details>
            <summary class="dropdown-summary">
              <span class="dropdown-arrow">></span>
              Mergekit configs
            </summary>
            <div class="dropdown-content">
              <p>Merge configurations used during the model creation process.</p>
              <div class="config-title">Initial Upscale (Passthrough)</div>
              <pre><code>base_model: zerofata/MS3.2-PaintedFantasy-v2-24B
<br>
merge_method: passthrough
<br>
dtype: bfloat16
slices:
  - sources:
      - model: zerofata/MS3.2-PaintedFantasy-v2-24B
        layer_range: [0, 29]
  - sources:
      - model: zerofata/MS3.2-PaintedFantasy-v2-24B
        layer_range: [10, 39]</code></pre>
              <div class="config-title">Final Merge (Slerp)</div>
              <pre><code>models:
  - model: zerofata/MS3.2-PaintedFantasy-Visage-33B
  - model: ../axolotl/Visage-V2-PT-1-SFT-2-KTO-1-DPO-1/merged
merge_method: slerp
base_model: ../axolotl/Visage-V2-PT-1-SFT-2-KTO-1-DPO-1/merged
parameters:
  t: [0.4, 0.2, 0, 0.2, 0.4]
dtype: bfloat16</code></pre>
            </div>
          </details>
        </div>
        <div class="dropdown-container">
          <details>
            <summary class="dropdown-summary">
              <span class="dropdown-arrow">></span>
              Axolotl configs
            </summary>
          <div class="dropdown-content">
            <p>Not optimized for cost / performance efficiency, YMMV.</p>
            <div class="config-title">Pretrain 4*H100</div>
            <pre><code>&#35; ====================
&#35; MODEL CONFIGURATION
&#35; ====================
base_model: ../mergekit/pf_v2_upscale
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
chat_template: mistral_v7_tekken
&#35; ====================
&#35; DATASET CONFIGURATION
&#35; ====================
datasets:
  - path: ./data/pretrain_dataset_v5_stripped.jsonl
    type: completion
<br>
dataset_prepared_path:
train_on_inputs: false  &#35; Only train on assistant responses
<br>
&#35; ====================
&#35; QLORA CONFIGURATION
&#35; ====================
adapter: qlora
load_in_4bit: true
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true
&#35; lora_modules_to_save:  &#35; Uncomment only if you added NEW tokens
<br>
&#35; ====================
&#35; TRAINING PARAMETERS
&#35; ====================
num_epochs: 1
micro_batch_size: 4
gradient_accumulation_steps: 1
learning_rate: 4e-5
optimizer: paged_adamw_8bit
lr_scheduler: rex
warmup_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 1.0
<br>
&#35; ====================
&#35; SEQUENCE &amp; PACKING
&#35; ====================
sequence_len: 12288
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
<br>
&#35; ====================
&#35; HARDWARE OPTIMIZATIONS
&#35; ====================
bf16: auto
flash_attention: true
gradient_checkpointing: offload
deepspeed: deepspeed_configs/zero1.json
<br>
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false  &#35; Cut Cross Entropy overrides this
liger_fused_linear_cross_entropy: false  &#35; Cut Cross Entropy overrides this
<br>
&#35; ====================
&#35; EVALUATION &amp; CHECKPOINTING
&#35; ====================
save_strategy: steps
save_steps: 40
save_total_limit: 5  &#35; Keep best + last few checkpoints
load_best_model_at_end: true
greater_is_better: false
<br>
&#35; ====================
&#35; LOGGING &amp; OUTPUT
&#35; ====================
output_dir: ./Visage-V2-PT-1
logging_steps: 2
save_safetensors: true
<br>
&#35; ====================
&#35; WANDB TRACKING
&#35; ====================
wandb_project: Visage-V2-PT
&#35; wandb_entity: your_entity
wandb_name: Visage-V2-PT-1</code></pre>
            <div class="config-title">SFT 4*H100</div>
            <pre><code>&#35; ====================
&#35; MODEL CONFIGURATION
&#35; ====================
base_model: ./Visage-V2-PT-1/merged
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
chat_template: mistral_v7_tekken
<br>
&#35; ====================
&#35; DATASET CONFIGURATION
&#35; ====================
datasets:
  - path: ./data/automated_dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
  - path: ./data/handcrafted_dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
  - path: ./data/instruct_dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
  - path: ./data/cw_dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
  - path: ./data/stories_dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
  - path: ./data/cw_claude_dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
  - path: ./data/summaries_dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
<br>
dataset_prepared_path:
train_on_inputs: false  &#35; Only train on assistant responses
<br>
&#35; ====================
&#35; QLORA CONFIGURATION
&#35; ====================
adapter: qlora
load_in_4bit: true
lora_r: 128
lora_alpha: 128
lora_dropout: 0.1
lora_target_linear: true
&#35; lora_modules_to_save:  &#35; Uncomment only if you added NEW tokens
<br>
&#35; ====================
&#35; TRAINING PARAMETERS
&#35; ====================
num_epochs: 2
micro_batch_size: 2
gradient_accumulation_steps: 1
learning_rate: 1e-5
optimizer: paged_adamw_8bit
lr_scheduler: rex
warmup_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 1.0
<br>
&#35; ====================
&#35; SEQUENCE &amp; PACKING
&#35; ====================
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
<br>
&#35; ====================
&#35; HARDWARE OPTIMIZATIONS
&#35; ====================
bf16: auto
flash_attention: true
gradient_checkpointing: offload
deepspeed: deepspeed_configs/zero1.json
<br>
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false  &#35; Cut Cross Entropy overrides this
liger_fused_linear_cross_entropy: false  &#35; Cut Cross Entropy overrides this
<br>
<br>
&#35; ====================
&#35; EVALUATION &amp; CHECKPOINTING
&#35; ====================
save_strategy: steps
save_steps: 20
save_total_limit: 5  &#35; Keep best + last few checkpoints
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false
<br>
&#35; ====================
&#35; LOGGING &amp; OUTPUT
&#35; ====================
output_dir: ./Visage-V2-PT-1-SFT-2
logging_steps: 2
save_safetensors: true
<br>
&#35; ====================
&#35; WANDB TRACKING
&#35; ====================
wandb_project: Visage-V2-SFT
&#35; wandb_entity: your_entity
wandb_name: Visage-V2-PT-1-SFT-2</code></pre>
            <div class="config-title">KTO 4*H100</div>
            <pre><code>&#35; ====================
&#35; MODEL CONFIGURATION
&#35; ====================
base_model: ./Visage-V2-PT-1-SFT-2/merged
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
chat_template: mistral_v7_tekken
<br>
&#35; ====================
&#35; RL/KTO CONFIGURATION
&#35; ====================
rl: kto
rl_beta: 0.1
kto_desirable_weight: 1.25
kto_undesirable_weight: 1.0
<br>
&#35; ====================
&#35; DATASET CONFIGURATION
&#35; ====================
datasets:
  - path: ./handcrafted_dataset_kto.jsonl
    type: llama3.argilla
  - path: ./approved_rp_dataset_kto.jsonl
    type: llama3.argilla
  - path: ./instruct_dataset_kto.jsonl
    type: llama3.argilla
dataset_prepared_path:
train_on_inputs: false  &#35; Only train on assistant responses
remove_unused_columns: False
<br>
&#35; ====================
&#35; QLORA CONFIGURATION
&#35; ====================
adapter: qlora
load_in_4bit: true
lora_r: 32
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
&#35; lora_modules_to_save:  &#35; Uncomment only if you added NEW tokens
<br>
&#35; ====================
&#35; TRAINING PARAMETERS
&#35; ====================
num_epochs: 1
micro_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 5e-6
optimizer: adamw_8bit
lr_scheduler: cosine
warmup_steps: 15
weight_decay: 0.001
max_grad_norm: 0.01
<br>
&#35; ====================
&#35; SEQUENCE CONFIGURATION
&#35; ====================
sequence_len: 8192
pad_to_sequence_len: true
<br>
&#35; ====================
&#35; HARDWARE OPTIMIZATIONS
&#35; ====================
bf16: auto
tf32: false
flash_attention: true
gradient_checkpointing: offload
deepspeed: deepspeed_configs/zero1.json
<br>
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false  &#35; Cut Cross Entropy overrides this
liger_fused_linear_cross_entropy: false  &#35; Cut Cross Entropy overrides this
<br>
&#35; ====================
&#35; CHECKPOINTING
&#35; ====================
save_steps: 100
save_total_limit: 10
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false
<br>
&#35; ====================
&#35; LOGGING &amp; OUTPUT
&#35; ====================
output_dir: ./Visage-V2-PT-1-SFT-2-KTO-1
logging_steps: 2
save_safetensors: true
<br>
&#35; ====================
&#35; WANDB TRACKING
&#35; ====================
wandb_project: Visage-V2-KTO
&#35; wandb_entity: your_entity
wandb_name: Visage-V2-PT-1-SFT-2-KTO-1</code></pre>
            <div class="config-title">DPO 4*H100</div>
            <pre><code>&#35; ====================
&#35; MODEL CONFIGURATION
&#35; ====================
base_model: ./Visage-V2-PT-1-SFT-2/merged
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
chat_template: mistral_v7_tekken
<br>
&#35; ====================
&#35; RL/DPO CONFIGURATION
&#35; ====================
rl: dpo
rl_beta: 0.1
<br>
&#35; ====================
&#35; DATASET CONFIGURATION
&#35; ====================
datasets:
  - path: ./handcrafted_dataset_mistral_rep.jsonl
    type: chat_template.default
    field_messages: messages
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system: ["system"]
      user: ["user"]
      assistant: ["assistant"]
dataset_prepared_path:
train_on_inputs: false  &#35; Only train on assistant responses
<br>
&#35; ====================
&#35; QLORA CONFIGURATION
&#35; ====================
adapter: qlora
load_in_4bit: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.1
lora_target_linear: true
&#35; lora_modules_to_save:  &#35; Uncomment only if you added NEW tokens
<br>
&#35; ====================
&#35; TRAINING PARAMETERS
&#35; ====================
num_epochs: 1
micro_batch_size: 2
gradient_accumulation_steps: 1
learning_rate: 2e-6
optimizer: adamw_8bit
lr_scheduler: cosine
warmup_steps: 5
weight_decay: 0.01
max_grad_norm: 1.0
<br>
&#35; ====================
&#35; SEQUENCE CONFIGURATION
&#35; ====================
sequence_len: 8192
pad_to_sequence_len: true
<br>
&#35; ====================
&#35; HARDWARE OPTIMIZATIONS
&#35; ====================
bf16: auto
tf32: false
flash_attention: true
gradient_checkpointing: offload
deepspeed: deepspeed_configs/zero1.json
<br>
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false  &#35; Cut Cross Entropy overrides this
liger_fused_linear_cross_entropy: false  &#35; Cut Cross Entropy overrides this
<br>
&#35; ====================
&#35; CHECKPOINTING
&#35; ====================
save_steps: 10
save_total_limit: 10
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false
<br>
&#35; ====================
&#35; LOGGING &amp; OUTPUT
&#35; ====================
output_dir: ./Visage-V2-PT-1-SFT-2-DPO-1
logging_steps: 2
save_safetensors: true
<br>
&#35; ====================
&#35; WANDB TRACKING
&#35; ====================
wandb_project: Visage-V2-DPO
&#35; wandb_entity: your_entity
wandb_name: Visage-V2-PT-1-SFT-2-DPO-1</code></pre>
          </div>
        </details>
      </div>
    </div>
  </div>
</div>
</body>
</html>