|
--- |
|
base_model: llm-jp/llm-jp-3-13b |
|
datasets: |
|
- llm-jp/databricks-dolly-15k-ja |
|
- kanhatakeyama/wizardlm8x22b-logical-math-coding-sft_additional-ja |
|
- kanhatakeyama/AutoMultiTurnByCalm3-22B |
|
- kanhatakeyama/ramdom-to-fixed-multiturn-Calm3 |
|
language: |
|
- ja |
|
- en |
|
license: cc-by-nc-sa-4.0 |
|
tags: |
|
- text-generation-inference |
|
- transformers |
|
- unsloth |
|
- llama |
|
- trl |
|
--- |
|
|
|
# Uploaded model |
|
|
|
- **Developed by:** tsuzukia |
|
- **License:** CC-BY-NC-SA |
|
- **Finetuned from model :** llm-jp/llm-jp-3-13b |
|
|
|
This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library. |
|
|
|
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth) |
|
|
|
このモデルは、CC-BY-NC-SAライセンスを含むデータセットを使用しているため、CC-BY-NC-SAのライセンスの下で提供されています。 |
|
データセットごとのライセンスは以下の通りです: |
|
|
|
ベースモデル(llm-jp/llm-jp-3-13b): apache-2.0 |
|
|
|
データセット1(LLMのための日本語インストラクションデータ): CC-BY-NC-SA |
|
|
|
データセット2(llm-jp/databricks-dolly-15k-ja): CC-BY-SA-3.0 |
|
|
|
データセット4(kanhatakeyama/wizardlm8x22b-logical-math-coding-sft_additional-ja): apache-2.0 |
|
|
|
データセット5(kanhatakeyama/AutoMultiTurnByCalm3-22B): apache 2.0/cc-by-sa-3.0/CC0/cc-by-4.0 |
|
|
|
データセット6(kanhatakeyama/ramdom-to-fixed-multiturn-Calm3): apache 2.0 |
|
|
|
## Uses |
|
松尾研「LLM 2024」最終課題用モデル |
|
|
|
学習の実行手順は以下の通りです。 |
|
|
|
サンプルコードで公開されていたModel_Inference_Template_unsloth_20241127.ipynbのデータセットを増やしたものになります。 |
|
|
|
実行環境はローカルで、VRAM16GBのGPUで約4日間の学習です。 |
|
|
|
```python |
|
# llm-jp/llm-jp-3-13bを4bit量子化のqLoRA設定でロード。 |
|
|
|
from unsloth import FastLanguageModel |
|
import torch |
|
# Base checkpoint to fine-tune, and the name to give the fine-tuned model
# ("it" stands for Instruction Tuning).
model_id = "llm-jp/llm-jp-3-13b"
new_model_id = "llm-jp-3-13b-it"

# Context length; Unsloth supports RoPE, so it can be chosen freely.
max_seq_length = 512
# Leaving this as None lets the dtype be selected automatically.
dtype = None
# True because a 13B model is being handled here.
load_in_4bit = True

# Create the FastLanguageModel instance together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    trust_remote_code=True,
)
|
|
|
# Prepare the model for SFT by attaching LoRA adapters to the loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA hyperparameters.
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Projection layers that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=max_seq_length,
)
|
|
|
``` |
|
### LLMのための日本語インストラクションデータ |
|
```python |
|
from datasets import load_dataset |
|
|
|
import json |
|
import ast |
|
from datasets import Dataset, DatasetDict |
|
import pandas as pd |
|
# Local JSON files of the ichikara-instruction distribution to load.
paths = [
    "Distribution20241221_all/ichikara-instruction-003-001-1.json",
    "Distribution20241221_all/ichikara-instruction-003-001-2.1.json",
    "Distribution20241221_all/ichikara-instruction-003-001-2.2.json",
    "Distribution20241221_all/ichikara-instruction-003-001-5.1.json",
    "Distribution20241221_all/ichikara-instruction-003-001-5.2.json",
]

from datasets import load_dataset, concatenate_datasets

# Load every file on its own and keep the 'train' split of each one.
datasets_list = [
    load_dataset('json', data_files=path)['train'] for path in paths
]

# Stack the per-file splits and expose them under a single 'train' key.
combined_train = concatenate_datasets(datasets_list)
dataset1 = DatasetDict({'train': combined_train})
|
``` |
|
### llm-jp/databricks-dolly-15k-ja |
|
```python |
|
# Load the dataset and rename its instruction/response columns to the
# 'text' / 'output' names used by the prompt formatting step.
dataset2 = load_dataset("llm-jp/databricks-dolly-15k-ja").rename_columns(
    {'instruction': 'text', 'response': 'output'}
)
|
``` |
|
### kanhatakeyama/wizardlm8x22b-logical-math-coding-sft_additional-ja |
|
```python |
|
def extract_content(example):
    """Flatten a chat-style record into 'text' / 'output' fields.

    The ``content`` of the first entry in ``example['messages']`` is taken
    as the user text and the ``content`` of the second entry as the
    assistant output.
    """
    messages = example['messages']
    return {
        'text': messages[0]['content'],    # user content
        'output': messages[1]['content'],  # assistant content
    }
|
|
|
# Load the dataset, pull text/output out of each record's messages, and
# drop every other column.
dataset4 = (
    load_dataset("kanhatakeyama/wizardlm8x22b-logical-math-coding-sft_additional-ja")
    .map(extract_content)
    .select_columns(['text', 'output'])
)
|
``` |
|
|
|
### kanhatakeyama/AutoMultiTurnByCalm3-22B |
|
|
|
```python |
|
# Use the q1/a1 columns as text/output and drop every other column.
dataset5 = (
    load_dataset("kanhatakeyama/AutoMultiTurnByCalm3-22B")
    .rename_columns({'q1': 'text', 'a1': 'output'})
    .select_columns(['text', 'output'])
)
|
``` |
|
|
|
### kanhatakeyama/ramdom-to-fixed-multiturn-Calm3 |
|
```python |
|
from datasets import load_dataset, DatasetDict |
|
def extract_content(example):
    """Flatten a chat-style record into 'text' / 'output' fields.

    The ``content`` of the first entry in ``example['messages']`` is taken
    as the user text and the ``content`` of the second entry as the
    assistant output.
    """
    messages = example['messages']
    return {
        'text': messages[0]['content'],    # user content
        'output': messages[1]['content'],  # assistant content
    }
|
|
|
# Move the existing 'text' column out of the way so that the 'text' field
# produced by extract_content does not collide with it.
dataset6 = load_dataset(
    "kanhatakeyama/ramdom-to-fixed-multiturn-Calm3"
).rename_column('text', 'text_old')

# Use the '20240806filtered' split as the training split.
dataset6 = DatasetDict({'train': dataset6['20240806filtered']})

# Pull text/output out of each record's messages and drop the other columns.
dataset6 = dataset6.map(extract_content).select_columns(['text', 'output'])
|
``` |
|
```python |
|
from datasets import concatenate_datasets |
|
|
|
# Merge the 'train' splits of the prepared datasets into one training set.
# dataset3 is not included.
dataset = DatasetDict({
    'train': concatenate_datasets(
        [
            ds['train']
            for ds in (dataset1, dataset2, dataset4, dataset5, dataset6)
        ]
    ),
})
dataset
|
``` |
|
```python |
|
# Prompt formats used at training time.

# Template without a context section.
# Placeholders, in order: instruction, answer.
prompt = """### 指示
{}
### 回答
{}"""

# Template with a context section.
# Placeholders, in order: instruction, context, answer.
prompt_with_context = """### 指示
コンテキストを参考に答えてください。
{}
### コンテキスト
{}
### 回答
{}"""
|
|
|
|
|
""" |
|
formatting_prompts_func: 各データをプロンプトに合わせた形式に合わせる |
|
""" |
|
EOS_TOKEN = tokenizer.eos_token # トークナイザーのEOSトークン(文末トークン) |
|
def formatting_prompts_func(examples): |
|
input = examples["text"] # 入力データ |
|
output = examples["output"] # 出力データ |
|
try: |
|
context = examples["context"] |
|
except KeyError: |
|
context = None |
|
if context: |
|
text = prompt_with_context.format(input, context,output) + EOS_TOKEN # プロンプトの作成 |
|
else: |
|
text = prompt.format(input, output) + EOS_TOKEN # プロンプトの作成 |
|
return { "formatted_text" : text, } # 新しいフィールド "formatted_text" を返す |
|
pass |
|
|
|
# Apply the prompt formatting to every record, using 4 parallel processes.
dataset = dataset.map(formatting_prompts_func, num_proc=4)
|
``` |
|
```python |
|
from trl import SFTTrainer |
|
from transformers import TrainingArguments |
|
from unsloth import is_bfloat16_supported |
|
|
|
# Build the SFT trainer. No eval_dataset is passed and no seed is set here.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    # Train on the field produced by the prompt formatting step.
    dataset_text_field="formatted_text",
    max_seq_length=max_seq_length,
    packing=False,
    args=TrainingArguments(
        output_dir="outputs",
        # Training length and learning-rate settings.
        num_train_epochs=2,
        max_steps=-1,
        warmup_steps=50,
        learning_rate=2e-4,
        # Batch settings: 16 samples per device, accumulated over 4 steps.
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        group_by_length=True,
        # Use bf16 when it is supported, fp16 otherwise.
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Logging and checkpoint settings.
        logging_steps=5,
        save_steps=100,
        save_total_limit=2,
        report_to="wandb",
        run_name="matsuo-lab",
    ),
)

#@title Run training
trainer_stats = trainer.train()
|
``` |