johannhartmann
committed on
Upload folder using huggingface_hub
- README.md +192 -0
- config.json +33 -0
- model.safetensors +3 -0
- special_tokens_map.json +24 -0
- tokenizer.model +3 -0
- tokenizer_config.json +322 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,192 @@
---
language:
- de
tags:
- german
- causal-lm
- text-generation
library_name: transformers
pipeline_tag: text-generation
license: apache-2.0
---

# BübleLM

<div align="center" style="margin-bottom: 2rem; margin-top: 2rem">
<img src="https://pieter.ai/resources/buble-logo.png" alt="BübleLM Logo" style="max-height: 450px; width: auto;"/>
<h1 style="margin-top: 1rem;">BübleLM</h1>
<p><em>A small German LM</em></p>
</div>

BübleLM is a German language model based on Gemma-2-2B, adapted using [trans-tokenization](https://pieter.ai/trans-tokenization/) with a custom German SentencePiece tokenizer. The model demonstrates how language-specific tokenization can significantly improve performance while maintaining the base model's capabilities.
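
As a rough illustration of what the German tokenizer buys, the sketch below compares token counts between the original Gemma tokenizer and the one shipped here. This is a minimal example, not from the card's evaluation: the sample sentence is ours, and loading `google/gemma-2-2b` requires accepting its license on the Hub.

```python
from transformers import AutoTokenizer

bueble_tok = AutoTokenizer.from_pretrained("flair/bueble-lm-2b")
gemma_tok = AutoTokenizer.from_pretrained("google/gemma-2-2b")  # gated repo

# Illustrative German sentence with long compounds.
sentence = "Die Donaudampfschifffahrtsgesellschaft stellte neue Kapitäninnen ein."
for name, tok in [("BübleLM", bueble_tok), ("Gemma-2-2B", gemma_tok)]:
    ids = tok(sentence, add_special_tokens=False)["input_ids"]
    print(f"{name}: {len(ids)} tokens")
```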

## Model Details

- **Architecture**: Based on the Gemma-2-2B decoder-only architecture
- **Parameters**: 2 billion
- **Tokenizer**: Custom German SentencePiece tokenizer (20k vocabulary)
  - Fertility rate: 1.78 tokens per word (see the measurement sketch below)
  - Optimized for German morphological structures
  - Trained on the same corpus as the model
- **Context Length**: 8192 tokens
- **Training Hardware**: Single node with 4x NVIDIA A100-SXM4-80GB GPUs
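
The measurement sketch referenced above: a minimal way to estimate fertility (average tokens per word) with this tokenizer. The sample text is a placeholder; the reported 1.78 was presumably measured on a larger held-out corpus.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("flair/bueble-lm-2b")

# Placeholder text; use a representative German corpus for a real estimate.
text = "Die Bundesnetzagentur veröffentlichte gestern ihren Jahresbericht zur Versorgungssicherheit."
n_words = len(text.split())
n_tokens = len(tokenizer(text, add_special_tokens=False)["input_ids"])
print(f"fertility = {n_tokens / n_words:.2f} tokens per word")
```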

## Training Data

Trained on 3.5B tokens from the Occiglot-FineWeb project, including:
- Contemporary web content (OSCAR 2015–2023)
- Legislative documents (EurLex, ParlaMint)
- News data (Tagesschau)
- Wiki sources

Data sampling weights (one way to realize them is sketched below):
- Wikipedia: 4x
- News/Parliamentary: 2x
- Other sources: 1x
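
Such weights can be realized by probability-weighted interleaving of the source datasets. The sketch below assumes the sources exist as separate `datasets` repos; the dataset names are placeholders, not the actual training pipeline.

```python
from datasets import load_dataset, interleave_datasets

# Placeholder dataset names; the real corpus assembly is not published here.
wiki = load_dataset("my-org/german-wikipedia", split="train", streaming=True)
news = load_dataset("my-org/german-news-parl", split="train", streaming=True)
web = load_dataset("my-org/german-web", split="train", streaming=True)

# Sampling weights 4:2:1, normalized to probabilities.
weights = [4, 2, 1]
probabilities = [w / sum(weights) for w in weights]
mixed = interleave_datasets([wiki, news, web], probabilities=probabilities, seed=42)
```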

## Performance

Key improvements over the Gemma-2-2B baseline:
- HellaSwag-DE: +71% (47.9% vs 28.0%)
- ARC-DE: +41% (32.3% vs 22.9%)
- Average zero-shot: +40% (35.8% vs 25.5%)

BübleLM-2B consistently outperforms both the base Gemma-2-2B and other German models like LLäMmlein-1B across most tasks.

<table class="model-comparison">
<thead>
<tr>
<th align="left">Model</th>
<th align="center" colspan="2">ARC-DE</th>
<th align="center" colspan="2">HellaSwag-DE</th>
<th align="center">TruthfulQA-DE</th>
<th align="center">Average</th>
</tr>
<tr>
<th></th>
<th align="center">0-shot</th>
<th align="center">3-shot</th>
<th align="center">0-shot</th>
<th align="center">3-shot</th>
<th align="center">0-shot</th>
<th align="center">0-shot</th>
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://huggingface.co/google/gemma-2-2b" target="_blank">Gemma-2-2B</a></td>
<td align="center">22.9</td>
<td align="center">23.1</td>
<td align="center">28.0</td>
<td align="center">27.6</td>
<td align="center">25.5</td>
<td align="center">25.5</td>
</tr>
<tr>
<td><a href="https://huggingface.co/LSX-UniWue/LLaMmlein_120M" target="_blank">LLäMmlein-120M</a></td>
<td align="center">24.7 ↑+8%</td>
<td align="center">-</td>
<td align="center">32.0 ↑+14%</td>
<td align="center">-</td>
<td align="center">25.0 ↓-2%</td>
<td align="center">27.2 ↑+7%</td>
</tr>
<tr>
<td><a href="https://huggingface.co/LSX-UniWue/LLaMmlein_1B" target="_blank">LLäMmlein-1B</a></td>
<td align="center">30.0 ↑+31%</td>
<td align="center">-</td>
<td align="center"><strong>48.5</strong> ↑+73%</td>
<td align="center">-</td>
<td align="center">23.4 ↓-8%</td>
<td align="center">34.0 ↑+33%</td>
</tr>
<tr>
<td><a href="https://huggingface.co/VAGOsolutions/SauerkrautLM-Gemma-2b" target="_blank">Sauerkraut-Gemma-2B</a></td>
<td align="center">28.0 ↑+22%</td>
<td align="center">34.6 ↑+50%</td>
<td align="center">37.2 ↑+33%</td>
<td align="center">44.1 ↑+60%</td>
<td align="center"><strong>32.9</strong> ↑+29%</td>
<td align="center">32.7 ↑+28%</td>
</tr>
<tr>
<td><strong>BübleLM (Ours)</strong></td>
<td align="center"><strong>32.3</strong> ↑+41%</td>
<td align="center"><strong>35.2</strong> ↑+52%</td>
<td align="center">47.9 ↑+71%</td>
<td align="center"><strong>46.6</strong> ↑+69%</td>
<td align="center">27.2 ↑+7%</td>
<td align="center"><strong>35.8</strong> ↑+40%</td>
</tr>
</tbody>
</table>

*Performance evaluated on German versions of ARC (knowledge-based QA), HellaSwag (commonsense reasoning), and TruthfulQA (truthfulness). Values show accuracy in percent; arrows indicate relative improvement over the Gemma-2-2B baseline. Best results shown in bold.*
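
For context, zero-shot accuracy on multiple-choice benchmarks like ARC and HellaSwag is typically computed by scoring each answer option under the model and picking the most likely one. The sketch below shows that scoring loop in its simplest form; it is illustrative, not the exact harness, prompts, or normalization behind the numbers above.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("flair/bueble-lm-2b")
model = AutoModelForCausalLM.from_pretrained("flair/bueble-lm-2b", torch_dtype=torch.bfloat16)
model.eval()

def option_logprob(context: str, option: str) -> float:
    """Sum of log-probabilities of the option's tokens given the context."""
    ctx_len = tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
    full_ids = tokenizer(context + option, return_tensors="pt")["input_ids"]
    with torch.no_grad():
        logits = model(input_ids=full_ids).logits
    log_probs = torch.log_softmax(logits.float(), dim=-1)
    total = 0.0
    # Logits at position i predict token i+1, so shift the index by one.
    for pos in range(ctx_len, full_ids.shape[1]):
        total += log_probs[0, pos - 1, full_ids[0, pos]].item()
    return total

question = "Frage: Was ist die Hauptstadt von Deutschland? Antwort:"
options = [" Berlin", " München", " Hamburg"]
print(max(options, key=lambda o: option_logprob(question, o)))
```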

## Safety & Ethics

### Toxicity
- Perplexity: 52.97 on the German TextDetox dataset
- Toxic content appears more out-of-distribution than for the baseline

### Gender Bias
- Evaluated using perplexity differences between traditional and gender-inclusive forms (a minimal scoring sketch follows below)
- Slight preference for gender-inclusive language (not statistically significant)
- Example: "Lehrer" vs "Lehrer*innen" (ΔPPL = -9.61)
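
A minimal sketch of the perplexity comparison referenced above. The sentence pair is our own illustration; the card's exact prompts and aggregation may differ.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("flair/bueble-lm-2b")
model = AutoModelForCausalLM.from_pretrained("flair/bueble-lm-2b", torch_dtype=torch.bfloat16)
model.eval()

def perplexity(text: str) -> float:
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # With labels set, the model returns mean cross-entropy over tokens.
        loss = model(**enc, labels=enc["input_ids"]).loss
    return torch.exp(loss).item()

ppl_traditional = perplexity("Die Lehrer diskutieren im Lehrerzimmer.")
ppl_inclusive = perplexity("Die Lehrer*innen diskutieren im Lehrerzimmer.")
print(f"ΔPPL = {ppl_inclusive - ppl_traditional:.2f}")  # negative => inclusive form preferred
```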

## Usage

**Note**: This is a base language model, not an instruction-tuned model. It is not optimized for chat or instruction following. For best results, use standard text completion rather than chat templates.

Also make sure the SentencePiece tokenizer library is installed:

```bash
pip install sentencepiece
```

```python
from transformers import pipeline

pipe = pipeline("text-generation", model="flair/bueble-lm-2b")
pipe("Ich bin")
```

Or with the full model API:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("flair/bueble-lm-2b")
model = AutoModelForCausalLM.from_pretrained(
    "flair/bueble-lm-2b",
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Basic text completion
text = "Berlin ist eine Stadt, die"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0]))
```

For instruction-tuning experiments or chat applications, we recommend first fine-tuning the model with appropriate German instruction datasets; a minimal fine-tuning sketch follows below.
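
As a starting point for such fine-tuning, here is a minimal causal-LM training sketch using the plain `Trainer` API. The dataset name, column names, and prompt template are placeholders, not a recommended setup.

```python
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("flair/bueble-lm-2b")
model = AutoModelForCausalLM.from_pretrained("flair/bueble-lm-2b", torch_dtype=torch.bfloat16)

# Placeholder dataset with "instruction" and "output" columns.
dataset = load_dataset("my-org/german-instructions", split="train")

def format_and_tokenize(example):
    # Simple prompt template; adapt to your data.
    text = f"Anweisung: {example['instruction']}\nAntwort: {example['output']}"
    return tokenizer(text, truncation=True, max_length=1024)

tokenized = dataset.map(format_and_tokenize, remove_columns=dataset.column_names)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="bueble-sft", per_device_train_batch_size=2,
                           num_train_epochs=1, bf16=True),
    train_dataset=tokenized,
    # mlm=False makes the collator set labels for causal-LM training.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
```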
## Limitations

- Limited vocabulary size (20k tokens) compared to multilingual models (250k for Gemma)
- Performance may vary on specialized domains not well represented in the training data
- Higher fertility rate (1.78) due to the smaller vocabulary size
- Inherits the base limitations of the Gemma architecture

## Citation

```bibtex
@article{delobelle2024buble,
  title={BübleLM: A small German LM},
  author={Delobelle, Pieter and Akbik, Alan and others},
  year={2024}
}
```
config.json
ADDED
@@ -0,0 +1,33 @@
{
  "_name_or_path": "pdelobelle/gemma-2-2b-de",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 2,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 3,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.1",
  "use_cache": true,
  "vocab_size": 20000
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:986191e4e7d52df000fc1b3b15c3d44e59bb2123e4a33ec2831b91532bb61fc1
size 4141229384
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "</s>",
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:16e8773affbd03448ffb79173feed1884514012160d3074641ee402dcad4f481
size 579378
tokenizer_config.json
ADDED
@@ -0,0 +1,322 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "add_prefix_space": null,
  "added_tokens_decoder": {
    "0": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "1": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "2": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "3": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "4": {"content": "▁▁", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "5": {"content": "▁▁▁", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "6": {"content": "▁▁▁▁", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "7": {"content": "▁▁▁▁▁▁▁▁", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "8": {"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "9": {"content": "--", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "10": {"content": "----", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "11": {"content": "-----", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "12": {"content": "--------", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "13": {"content": "----------------", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "14": {"content": "++", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "15": {"content": "/**", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "16": {"content": "***", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "17": {"content": "****", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "18": {"content": "******", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "19": {"content": "********", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "20": {"content": "**/", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "21": {"content": "##", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "22": {"content": "###", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "23": {"content": "<|im_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "24": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "25": {"content": "<|system|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "26": {"content": "<|user|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "27": {"content": "<|assistant|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "28": {"content": "ββ", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "29": {"content": "ββ", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "30": {"content": "β", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "31": {"content": "β", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "32": {"content": "β", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "33": {"content": "β", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "34": {"content": "{", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "35": {"content": "}\"", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "36": {"content": "{\"", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "37": {"content": "}", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false}
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": true,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "</s>",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
trainer_state.json
ADDED
The diff for this file is too large to render. See raw diff.
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41a61c806a0bb2deecb47b569453df82a098015e8a830d9b725661463eeec7db
size 5304