zxdu20 committed
Commit
3946e1b
1 Parent(s): 758ba9f

Update README.md

Files changed (3)
  1. README.md +2 -0
  2. tokenization_chatglm.py +12 -12
  3. tokenizer_config.json +2 -2
README.md CHANGED
@@ -8,6 +8,8 @@ tags:
 - thudm
 ---
 # ChatGLM-6B
+**This repository is no longer maintained; please use [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4) instead.**
+
 ## Introduction
 ChatGLM-6B is an open-source dialogue language model that supports bilingual question answering in Chinese and English, based on the [General Language Model (GLM)](https://github.com/THUDM/GLM) architecture with 6.2 billion parameters. Combined with model quantization, it can be deployed locally on consumer-grade graphics cards (as little as 6 GB of VRAM at the INT4 quantization level). ChatGLM-6B uses the same technology as [ChatGLM](https://chatglm.cn) and is optimized for Chinese question answering and dialogue. Trained on about 1T tokens of Chinese and English corpus, and further aligned with supervised fine-tuning, feedback bootstrap, and reinforcement learning from human feedback, the 6.2-billion-parameter ChatGLM-6B can already generate answers that are well aligned with human preferences.
 
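The INT4 deployment path mentioned in the introduction goes through the standard `transformers` remote-code loading flow. A minimal sketch, following the usage documented for ChatGLM-6B (the `chat` and `quantize` helpers live in this repository's remote code, not in `transformers` itself):

```python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# FP16 weights need roughly 13 GB of VRAM; quantize(4) brings this down to about 6 GB.
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) \
    .half().quantize(4).cuda()
model = model.eval()

response, history = model.chat(tokenizer, "你好", history=[])
print(response)
```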
tokenization_chatglm.py CHANGED
@@ -171,8 +171,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         do_lower_case=False,
         remove_space=False,
         bos_token='<sop>',
-        eos_token='</s>',
-        eop_token='<eop>',
+        eos_token='<eop>',
+        end_token='</s>',
         mask_token='[MASK]',
         gmask_token='[gMASK]',
         padding_side="left",
@@ -185,7 +185,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             padding_side=padding_side,
             bos_token=bos_token,
             eos_token=eos_token,
-            eop_token=eop_token,
+            end_token=end_token,
             mask_token=mask_token,
             gmask_token=gmask_token,
             num_image_tokens=num_image_tokens,
@@ -198,7 +198,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
         self.bos_token = bos_token
         self.eos_token = eos_token
-        self.eop_token = eop_token
+        self.end_token = end_token
         self.mask_token = mask_token
         self.gmask_token = gmask_token
 
@@ -213,14 +213,14 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         return self.convert_tokens_to_ids(self.gmask_token)
 
     @property
-    def eop_token_id(self) -> Optional[int]:
+    def end_token_id(self) -> Optional[int]:
         """
-        `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
+        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
         set.
         """
-        if self.eop_token is None:
+        if self.end_token is None:
             return None
-        return self.convert_tokens_to_ids(self.eop_token)
+        return self.convert_tokens_to_ids(self.end_token)
 
     @property
     def vocab_size(self):
@@ -324,18 +324,18 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         """
         mask_ids = self.sp_tokenizer[self.mask_token]
         gmask_ids = self.sp_tokenizer[self.gmask_token]
-        eop_id = self.sp_tokenizer[self.eop_token]
+        eos_id = self.sp_tokenizer[self.eos_token]
         if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
             token_ids_0 += [gmask_ids]
 
         if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
-            token_ids_0 += [self.sp_tokenizer[self.eos_token]]
+            token_ids_0 += [self.sp_tokenizer[self.end_token]]
 
         token_ids_0 += [self.sp_tokenizer[self.bos_token]]
 
         if token_ids_1 is not None:
-            if not token_ids_1 or token_ids_1[-1] != eop_id:
-                token_ids_1 += [eop_id]
+            if not token_ids_1 or token_ids_1[-1] != eos_id:
+                token_ids_1 += [eos_id]
             token_ids_0 += token_ids_1
 
         return token_ids_0
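To make the new sequence layout concrete, here is a minimal standalone sketch of the updated `build_inputs_with_special_tokens` logic, with hypothetical integer ids standing in for the real `sp_tokenizer` lookups:

```python
from typing import List, Optional

# Hypothetical ids standing in for sp_tokenizer lookups.
MASK, GMASK, END, BOS, EOS = 3, 4, 5, 6, 7  # [MASK], [gMASK], </s>, <sop>, <eop>

def build_inputs(token_ids_0: List[int],
                 token_ids_1: Optional[List[int]] = None) -> List[int]:
    # Mirror of the updated method: close the context, start generation
    # at BOS, and terminate the second segment with the new EOS ('<eop>').
    if MASK not in token_ids_0 and GMASK not in token_ids_0:
        token_ids_0 += [GMASK]
    if token_ids_0[-1] != MASK and token_ids_0[-1] != GMASK:
        token_ids_0 += [END]          # end_token '</s>' closes the context
    token_ids_0 += [BOS]              # bos_token '<sop>' starts the reply
    if token_ids_1 is not None:
        if not token_ids_1 or token_ids_1[-1] != EOS:
            token_ids_1 += [EOS]      # eos_token '<eop>' ends the reply
        token_ids_0 += token_ids_1
    return token_ids_0

print(build_inputs([10, 11]))            # [10, 11, 4, 6]: ctx, gMASK, sop
print(build_inputs([10, 11], [20, 21]))  # [10, 11, 4, 6, 20, 21, 7]: ..., answer, eop
```

The net effect of the commit is a rename, not a behavior change: `'</s>'` now closes the context segment as `end_token`, while `'<eop>'` becomes the `eos_token` that terminates the generated answer, so downstream code that stops generation on `eos_token_id` halts at `<eop>`.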
tokenizer_config.json CHANGED
@@ -1,8 +1,8 @@
 {
   "name_or_path": "THUDM/chatglm-6b",
   "bos_token": "<sop>",
-  "eop_token": "<eop>",
-  "eos_token": "</s>",
+  "eos_token": "<eop>",
+  "end_token": "</s>",
   "gmask_token": "[gMASK]",
   "mask_token": "[MASK]",
   "pad_token": "<pad>",