Blair1213 committed on
Commit
6957bb5
·
0 Parent(s):

Upload MedTok tokenizer

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. code2embeddings.json +3 -0
  3. code2tokens.json +3 -0
  4. tokenizer.py +123 -0
  5. vocab.json +3 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.json filter=lfs diff=lfs merge=lfs -text
code2embeddings.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e366d6fb34335cc71c5ee55a8268a834705756564f13de6eae338a5a5ee1fb6a
3
+ size 4077987302
code2tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:349afbcdbc8371e7683e5151d33a6a1dbd6f4ebfd4b0e87a28f40445c08352ab
3
+ size 185630676
tokenizer.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ from transformers import PreTrainedTokenizer
5
+
6
+
7
class MedTok(PreTrainedTokenizer):
    """Lookup-table tokenizer for medical codes (e.g. ICD codes).

    Maps each code to a pre-computed token sequence (``code2tokens_file``)
    and a pre-computed embedding vector (``embedding_file``); the vocabulary
    file maps token strings to integer ids.
    """

    def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
        """Load the three JSON lookup tables.

        Args:
            vocab_file: JSON mapping token string -> integer id.
            code2tokens_file: JSON mapping medical code -> list of tokens.
            embedding_file: JSON mapping medical code -> embedding vector.
            **kwargs: forwarded to ``PreTrainedTokenizer`` (e.g. unk_token).
        """
        print(f"Loading vocab from: {vocab_file}")
        print(f"Loading token map from: {code2tokens_file}")

        with open(vocab_file, "r") as f:
            self.vocab = json.load(f)

        with open(code2tokens_file, "r") as f:
            self.code2tok = json.load(f)

        with open(embedding_file, "r") as f:
            self.code2emb = json.load(f)

        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.tokens_to_ids = self.vocab  # alias

        # Must run after self.vocab exists: the base constructor may call
        # get_vocab()/convert_tokens_to_ids during special-token setup.
        super().__init__(**kwargs)

    # ---------- required interface ----------
    def _tokenize(self, text):
        """Return the pre-computed token list for a known code.

        Unknown codes are delegated to ``_infer_and_register``, which is
        disabled in this version and raises ``NotImplementedError``.
        """
        if text in self.code2tok:
            return self.code2tok[text]
        return self._infer_and_register(text)

    def embed(self, text):
        """Return the pre-computed embedding vector for *text*.

        Raises:
            KeyError: if no embedding is stored for the code.
            NotImplementedError: if the code is not tokenizable at all.

        Note: the original implementation returned an undefined name
        (``ids``) on a miss, which crashed with ``NameError``; a KeyError
        is the intended failure mode.
        """
        # Tokenize first: validates the code is known (raises for
        # unregistered codes), matching the original call order.
        self._tokenize(text)
        if text in self.code2emb:
            return self.code2emb[text]
        raise KeyError(f"No embedding available for code: {text!r}")

    def encode(self, text):
        """Tokenize *text* and return its token sequence."""
        return self._tokenize(text)

    def _convert_token_to_id(self, token):
        """Map a token string to its id, falling back to the unk id (or 0)."""
        return self.vocab.get(token, self.vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, idx):
        """Map an id back to its token string, falling back to unk_token."""
        return self.ids_to_tokens.get(idx, self.unk_token)

    def get_vocab(self):
        """Return the token -> id vocabulary mapping."""
        return self.vocab

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Concatenate one or two id sequences; no special tokens are added.

        Args:
            token_ids_0: list[int] — ids for the first sequence.
            token_ids_1: Optional[list[int]] — ids for the second sequence.
        """
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(self, token_ids, already_has_special_tokens=False):
        """All positions are regular tokens (this tokenizer adds none)."""
        return [0] * len(token_ids)

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """Segment ids: 0 for the first sequence, 1 for the second (if any)."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    # ---------- dynamic extension interface (disabled) ----------
    def _infer_and_register(self, code, code_desc="This is a medical code"):
        raise NotImplementedError("Dynamic token generation is disabled in this version.")

    # ---------- saving ----------
    def save_updates(self, out_vocab="vocab.json", out_map="code2tokens.json"):
        """Persist the (possibly extended) vocab and code->tokens map.

        Uses context managers so the files are closed even on error
        (the original leaked the file handles).
        """
        with open(out_vocab, "w") as f:
            json.dump(self.vocab, f, indent=2)
        with open(out_map, "w") as f:
            json.dump(self.code2tok, f, indent=2)

    def save_pretrained(self, save_directory):
        """Write vocab, token map, and a tokenizer_config.json to *save_directory*."""
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, "vocab.json"), "w") as f:
            json.dump(self.vocab, f, indent=2)
        with open(os.path.join(save_directory, "code2tokens.json"), "w") as f:
            json.dump(self.code2tok, f, indent=2)
        tokenizer_config = {
            "tokenizer_class": "MedTok",
            "vocab_file": "vocab.json",
            "code2tokens_file": "code2tokens.json",
            "code2embedding_file": "code2embeddings.json",
        }
        with open(os.path.join(save_directory, "tokenizer_config.json"), "w") as f:
            json.dump(tokenizer_config, f, indent=2)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Reconstruct a MedTok from a directory written by save_pretrained.

        Fix: the original omitted ``embedding_file``, a required __init__
        argument, so loading always failed with TypeError. We resolve it to
        the filename save_pretrained records (code2embeddings.json) unless
        the caller overrides it via kwargs.
        """
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        code2tokens_file = os.path.join(pretrained_model_name_or_path, "code2tokens.json")
        embedding_file = kwargs.pop(
            "embedding_file",
            os.path.join(pretrained_model_name_or_path, "code2embeddings.json"),
        )
        return cls(
            vocab_file=vocab_file,
            code2tokens_file=code2tokens_file,
            embedding_file=embedding_file,
            **kwargs,
        )
102
+
103
+
104
+
105
+ '''vocab_path = "vocab.json"
106
+ token_path = "code2tokens.json"
107
+ embedding_path = "code2embeddings.json"
108
+
109
+ tokenizer = MedTok(
110
+ vocab_file=vocab_path,
111
+ code2tokens_file=token_path,
112
+ embedding_file=embedding_path,
113
+ unk_token='[UNK]',
114
+ pad_token='[PAD]',
115
+ )
116
+
117
+ tokens = tokenizer.tokenize("E11.9")
118
+ ids = tokenizer.encode("E11.9")
119
+ embed = tokenizer.embed("E11.9")
120
+ print("Tokens:", tokens)
121
+ print("Token IDs:", ids)
122
+ print("Decoded:", tokenizer.decode(tokens))
123
+ print("Embedding:", embed)'''
vocab.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:111028fccba182d08ae1191edffe338a12a65504bcc10b92e6b7775634328e78
3
+ size 523814