zxdu20 committed
Commit c808a4e
Parent: ee32c34

Slim embedding

config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "THUDM/chatglm-6b",
+  "_name_or_path": "THUDM/chatglm-6b-int4-qe",
   "architectures": [
     "ChatGLMForConditionalGeneration"
   ],
@@ -8,21 +8,23 @@
     "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
     "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
   },
-  "bos_token_id": 150004,
-  "eos_token_id": 150005,
-  "pad_token_id": 20003,
+  "bos_token_id": 130004,
+  "eos_token_id": 130005,
+  "gmask_token_id": 130001,
   "hidden_size": 4096,
   "inner_hidden_size": 16384,
   "layernorm_epsilon": 1e-05,
+  "mask_token_id": 130000,
   "max_sequence_length": 2048,
   "model_type": "chatglm",
   "num_attention_heads": 32,
   "num_layers": 28,
+  "pad_token_id": 3,
   "position_encoding_2d": true,
   "quantization_bit": 4,
   "quantization_embeddings": true,
   "torch_dtype": "float16",
-  "transformers_version": "4.26.1",
+  "transformers_version": "4.27.1",
   "use_cache": true,
-  "vocab_size": 150528
+  "vocab_size": 130528
 }
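
The 20,000-entry drop in vocab_size (150528 to 130528) matches the image-token block removed from the tokenizer below, so every special-token id shifts down by the same amount. A minimal sketch for checking the new ids after loading (not part of the commit; assumes the usual trust_remote_code loading path):

from transformers import AutoConfig, AutoTokenizer

repo = "THUDM/chatglm-6b-int4-qe"
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

# Special tokens shift down by the 20000 removed image tokens,
# e.g. bos 150004 -> 130004, gMASK 150001 -> 130001.
assert config.bos_token_id == 130004
assert config.vocab_size == 130528
assert tokenizer.gmask_token_id == config.gmask_token_id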
configuration_chatglm.py CHANGED
@@ -66,6 +66,8 @@ class ChatGLMConfig(PretrainedConfig):
         use_cache=False,
         bos_token_id=150004,
         eos_token_id=150005,
+        mask_token_id=150000,
+        gmask_token_id=150001,
         pad_token_id=0,
         max_sequence_length=2048,
         inner_hidden_size=16384,
@@ -87,6 +89,8 @@ class ChatGLMConfig(PretrainedConfig):
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
         self.pad_token_id = pad_token_id
+        self.mask_token_id = mask_token_id
+        self.gmask_token_id = gmask_token_id
         self.position_encoding_2d = position_encoding_2d
         self.quantization_bit = quantization_bit
         self.quantization_embeddings = quantization_embeddings
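
Note that the new defaults keep the original 150k-vocab ids, so checkpoints whose config.json lacks these keys still resolve to the old token layout; this repo's config.json overrides them with the slimmed 130k ids. A minimal sketch (not part of the commit; assumes this repo's files are importable):

from configuration_chatglm import ChatGLMConfig

# Defaults preserve the original 150k-vocab ids for older checkpoints.
cfg = ChatGLMConfig()
assert (cfg.mask_token_id, cfg.gmask_token_id) == (150000, 150001)

# config.json in this repo passes the slimmed ids explicitly.
cfg = ChatGLMConfig(mask_token_id=130000, gmask_token_id=130001)
assert cfg.gmask_token_id == 130001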
modeling_chatglm.py CHANGED
@@ -921,9 +921,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):

         if position_ids is None:
-            MASK, gMASK = 150000, 150001
-            mask_token = MASK if MASK in input_ids else gMASK
-            use_gmask = False if MASK in input_ids else True
+            MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+            mask_token = gMASK if gMASK in input_ids else MASK
+            use_gmask = True if gMASK in input_ids else False
             mask_positions = [seq.tolist().index(mask_token) for seq in input_ids]
             position_ids = self.get_position_ids(
@@ -1084,9 +1084,9 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             **kwargs
     ) -> dict:
         batch_size, seq_length = input_ids.shape
-        MASK, gMASK = 150000, 150001
-        mask_token = MASK if MASK in input_ids else gMASK
-        use_gmask = False if MASK in input_ids else True
+        MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+        mask_token = gMASK if gMASK in input_ids else MASK
+        use_gmask = True if gMASK in input_ids else False
         seqs = input_ids.tolist()
         mask_positions = [seq.index(mask_token) for seq in seqs]
@@ -1408,6 +1408,11 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):

         self.transformer = quantize(self.transformer, bits, use_quantization_cache=use_quantization_cache, empty_init=empty_init, **kwargs)

+        if self.device == torch.device("cpu"):
+            dtype = torch.float32
+        else:
+            dtype = torch.half
+
         if quantize_embeddings:
             logger.info("Applying quantization to embeddings")
             self.transformer.word_embeddings = QuantizedEmbedding(
@@ -1415,11 +1420,11 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 weight_tensor=self.transformer.word_embeddings.weight.to(self.device),
                 num_embeddings=self.transformer.word_embeddings.num_embeddings,
                 embedding_dim=self.transformer.word_embeddings.embedding_dim,
-                dtype=torch.half,
-                empty_init=True,
+                dtype=dtype,
+                empty_init=empty_init,
                 device=self.transformer.word_embeddings.weight.device,
             )
             self.lm_head = QuantizedLinear(
                 weight_bit_width=bits,
                 weight_tensor=self.lm_head.weight.to(self.device),
                 bias_tensor=None,
@@ -1428,8 +1433,8 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 bias=False,
                 quantized_weight=self.transformer.word_embeddings.weight,
                 quantized_weight_scale=self.transformer.word_embeddings.weight_scale,
-                dtype=torch.half,
-                empty_init=True,
+                dtype=dtype,
+                empty_init=empty_init,
                 device=self.lm_head.weight.device,
             )
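
Besides reading the ids from the config, the selection order flips: [gMASK] now takes priority when both mask tokens appear in a sequence. A standalone sketch of the updated branch (function name hypothetical, not part of the commit):

import torch

def pick_mask_token(input_ids, mask_id=130000, gmask_id=130001):
    # Mirrors the updated logic: prefer [gMASK] when present
    # (the generation path), otherwise fall back to [MASK].
    use_gmask = bool((input_ids == gmask_id).any())
    return (gmask_id if use_gmask else mask_id), use_gmask

ids = torch.tensor([[5, 130000, 130001, 130004]])
assert pick_mask_token(ids) == (130001, True)  # gMASK wins over MASK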
 
quantization.py CHANGED
@@ -369,7 +369,7 @@ class QuantizedEmbedding(Embedding):  # TODO: backward, check empty_init
             )
             self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
         else:
-            self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+            self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).to(kwargs["dtype"])
             self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
             if weight_bit_width == 4:
                 self.weight = compress_int4_weight(self.weight)
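
Making the scale dtype configurable pairs with the new CPU branch in modeling_chatglm.py: on CPU the caller passes float32 instead of half, where fp16 kernels are poorly supported. A minimal sketch of the per-row absmax scheme itself (helper name hypothetical, not part of the commit):

import torch

def quantize_rows(weight, bit_width=4, dtype=torch.float32):
    # Per-row absmax scale: the largest magnitude in each row maps to
    # the top of the signed range (7 for 4-bit, 127 for 8-bit).
    scale = (weight.abs().max(dim=-1).values / ((2 ** (bit_width - 1)) - 1)).to(dtype)
    q = torch.round(weight / scale[:, None]).to(torch.int8)
    return q, scale

w = torch.randn(4, 8)
q, s = quantize_rows(w)
w_approx = q.to(s.dtype) * s[:, None]  # lossy reconstruction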
tokenization_chatglm.py CHANGED
@@ -48,11 +48,13 @@ class SPTokenizer:
     def __init__(
             self,
             vocab_file,
+            num_image_tokens=20000,
             max_blank_length=80,
             byte_fallback=True,
     ):
         assert vocab_file is not None
         self.vocab_file = vocab_file
+        self.num_image_tokens = num_image_tokens
         self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
         self.max_blank_length = max_blank_length
         self.byte_fallback = byte_fallback
@@ -70,10 +72,6 @@ class SPTokenizer:
     def get_tab_token():
         return f"<|tab|>"

-    @property
-    def num_image_tokens(self):
-        return 20000
-
     @property
     def num_text_tokens(self):
         return self.text_tokenizer.num_tokens
@@ -178,6 +176,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             mask_token='[MASK]',
             gmask_token='[gMASK]',
             padding_side="left",
+            num_image_tokens=20000,
             **kwargs
     ) -> None:
         super().__init__(
@@ -197,10 +196,16 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         self.mask_token = mask_token
         self.gmask_token = gmask_token

-        self.sp_tokenizer = SPTokenizer(vocab_file)
+        self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)

         """ Initialisation """

+    @property
+    def gmask_token_id(self) -> Optional[int]:
+        if self.gmask_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.gmask_token)
+
     @property
     def eop_token_id(self) -> Optional[int]:
         """
tokenizer_config.json CHANGED
@@ -10,6 +10,7 @@
   "remove_space": false,
   "do_lower_case": false,
   "tokenizer_class": "ChatGLMTokenizer",
+  "num_image_tokens": 0,
   "auto_map": {
     "AutoTokenizer": [
       "tokenization_chatglm.ChatGLMTokenizer",