dorami-ai committed
Commit ec3279b · 1 Parent(s): 13f4f86
README.md CHANGED
@@ -1,3 +1,47 @@
 ---
 license: apache-2.0
+datasets:
+- google/wiki40b
+language:
+- zh
+base_model:
+- openai-community/gpt2
 ---
+
+# Dorami
+
+A GPT-based pretrained model that uses the BERT tokenizer.
+
+## Model description
+
+### Training data
+
+[google/wiki40b](https://huggingface.co/datasets/google/wiki40b)
+
+### Training code
+
+[dorami](https://github.com/6zeus/dorami.git)
+
+## How to use
+
+### 1. Download the model from the Hugging Face Hub
+
+```bash
+git lfs install
+git clone https://huggingface.co/lucky2me/Dorami
+```
+
+### 2. Use the model downloaded above
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+model_path = "The path of the model downloaded above"
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path)
+text = "fill in any text you like."
+encoded_input = tokenizer(text, return_tensors='pt')
+output = model(**encoded_input)
+predicted_token_id = torch.argmax(output.logits[:, -1, :], dim=-1)
+decoded_text = tokenizer.decode(predicted_token_id, skip_special_tokens=True)
+print("decoded text:", decoded_text)
+```
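
The README example above decodes only a single next token. For multi-token generation, a minimal sketch along the following lines should work (the prompt and `max_new_tokens` value are illustrative, and `model_path` is the local clone from step 1):

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "The path of the model downloaded above"  # illustrative placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

encoded_input = tokenizer("fill in any text you like.", return_tensors="pt")
# Greedy decoding; [SEP] (id 102) serves as the end-of-sequence token here.
output_ids = model.generate(
    **encoded_input,
    max_new_tokens=50,
    eos_token_id=tokenizer.sep_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```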
config.json CHANGED
@@ -4,13 +4,13 @@
     "GPT2LMHeadModel"
   ],
   "attn_pdrop": 0.1,
-  "bos_token_id": 50256,
+  "bos_token_id": 101,
   "embd_pdrop": 0.1,
-  "eos_token_id": 50256,
+  "eos_token_id": 102,
   "initializer_range": 0.02,
   "layer_norm_epsilon": 1e-05,
   "model_type": "gpt2",
-  "n_ctx": 1024,
+  "n_ctx": 512,
   "n_embd": 768,
   "n_head": 12,
   "n_inner": null,
generation_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "_from_model_config": true,
-  "bos_token_id": 50256,
-  "eos_token_id": 50256,
+  "bos_token_id": 101,
+  "eos_token_id": 102,
   "transformers_version": "4.49.0"
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4f40987ee851732e91691d258a1d89cd4fb4ec690c45f50ff0428be1f440bfc
+oid sha256:6b1ddf9e5855df07c92d6ea6d5c8e169237f570492c7b3ac51ace4e926afbf7f
 size 406717056
special_tokens_map.json CHANGED
@@ -1,5 +1,7 @@
 {
+  "bos_token": "[CLS]",
   "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
tokenizer_config.json CHANGED
@@ -41,9 +41,11 @@
       "special": true
     }
   },
+  "bos_token": "[CLS]",
   "clean_up_tokenization_spaces": false,
   "cls_token": "[CLS]",
   "do_lower_case": false,
+  "eos_token": "[SEP]",
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
   "model_max_length": 512,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:79c95a763fecaef342a7aa85c8b13b643ac1becff5c72ee12b339f47731ac899
+oid sha256:ec831de7b2f8d8e5eed95de7449859762e59b7cfdb28298ea8cda6dab7d02a09
 size 5304