model: add test files and support command line arguments
- .gitattributes +1 -0
- claude_tokenizer.py +36 -11
- golden_ratio_1_million_digits.txt +0 -0
- golden_ratio_1_million_digits.txt.tokens +3 -0
- groups_merged-enhancedV3.txt +0 -0
- groups_merged-enhancedV3.txt.tokens +0 -0
- prompt_test.txt +16 -0
- prompt_test.txt.tokens +637 -0
- tokenize.py +163 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+golden_ratio_1_million_digits.txt.tokens filter=lfs diff=lfs merge=lfs -text

claude_tokenizer.py
CHANGED
@@ -3,7 +3,7 @@ import json
 from typing import List, Dict

 class ClaudeTokenizer:
-    def __init__(self, config_file: str):
+    def __init__(self, config_file: str, algorithm: str = "trie"):
         with open(config_file, "r") as f:
             config = json.load(f)

@@ -22,6 +22,10 @@ class ClaudeTokenizer:
         self.pat = re.compile(self.pat_str)
         self.vocab_trie = self._build_trie(self.vocab)

+        self.algorithm = algorithm
+        if algorithm not in ["trie", "linear"]:
+            raise ValueError("Invalid algorithm. Choose 'trie' or 'linear'.")
+
     def _build_trie(self, vocab: List[str]) -> Dict:
         trie = {}
         for token in vocab:
@@ -37,10 +41,13 @@ class ClaudeTokenizer:
         return trie

     def tokenize(self, text: str) -> List[str]:
-
-
-
-
+        if self.algorithm == "trie":
+            tokens = []
+            for part in self.pat.findall(text):
+                tokens.extend(self._tokenize_part_trie(part))
+            return tokens
+        else:
+            return self._tokenize_part_linear(text)

     def encode(self, text: str) -> List[int]:
         tokens = self.tokenize(text)
@@ -52,7 +59,7 @@ class ClaudeTokenizer:
     def decode(self, ids: List[int]) -> str:
         return "".join(self.id_to_token.get(id, "") for id in ids)

-    def
+    def _tokenize_part_trie(self, text: str) -> List[str]:
         tokens = []
         while text:
             current = self.vocab_trie
@@ -65,7 +72,22 @@
             longest_match = current["*"]
             if longest_match:
                 tokens.append(longest_match)
-                text = text[len(longest_match)
+                text = text[len(longest_match):]
+            else:
+                tokens.append(text[0])
+                text = text[1:]
+        return tokens
+
+    def _tokenize_part_linear(self, text: str) -> List[str]:
+        tokens = []
+        while text:
+            longest_match = ""
+            for token in self.vocab:
+                if text.startswith(token) and len(token) > len(longest_match):
+                    longest_match = token
+            if longest_match:
+                tokens.append(longest_match)
+                text = text[len(longest_match):]
             else:
                 tokens.append(text[0])
                 text = text[1:]
@@ -74,14 +96,17 @@

 # Usage example
 if __name__ == "__main__":
-
+    # Choose the algorithm: "trie" or "linear"
+    algorithm = "linear"  # or "trie"
+
+    tokenizer = ClaudeTokenizer("tokenizer_config.json", algorithm=algorithm)

     test_text = """Hello! It's nice to meet you. How can I assist you today? I'm here to help with any questions you might have or tasks you need help with."""
     tokens = tokenizer.tokenize(test_text)
-    print("Tokens:", tokens)
+    print(f"Tokens ({algorithm}):", tokens)

     encoded = tokenizer.encode(test_text)
-    print("Encoded:", encoded)
+    print(f"Encoded ({algorithm}):", encoded)

     decoded = tokenizer.decode(encoded)
-    print("Decoded:", decoded)
+    print(f"Decoded ({algorithm}):", decoded)

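The change above selects between two longest-match strategies: a character-trie walk ("trie") and a scan over the whole vocabulary at every position ("linear"). The snippet below is a minimal, self-contained sketch of that greedy longest-match idea over a made-up toy vocabulary, written only to illustrate the two code paths; the real tokenizer loads its vocabulary from tokenizer_config.json and pre-splits the input with the pat_str regex before the per-part trie pass.

from typing import Dict, List

TOY_VOCAB = ["h", "e", "l", "o", "he", "hell", "hello", " ", "wo", "world"]  # illustrative only

def build_trie(vocab: List[str]) -> Dict:
    trie: Dict = {}
    for token in vocab:
        node = trie
        for ch in token:
            node = node.setdefault(ch, {})
        node["*"] = token  # mark the end of a complete token
    return trie

def tokenize_trie(text: str, trie: Dict) -> List[str]:
    # Walk the trie from the current position, remembering the last complete token seen.
    tokens = []
    while text:
        node, longest = trie, ""
        for ch in text:
            if ch not in node:
                break
            node = node[ch]
            longest = node.get("*", longest)
        if longest:
            tokens.append(longest)
            text = text[len(longest):]
        else:  # nothing in the vocab matches here: fall back to a single character
            tokens.append(text[0])
            text = text[1:]
    return tokens

def tokenize_linear(text: str, vocab: List[str]) -> List[str]:
    # Same greedy rule, but scanning the whole vocabulary at every position.
    tokens = []
    while text:
        longest = max((t for t in vocab if text.startswith(t)), key=len, default="")
        if longest:
            tokens.append(longest)
            text = text[len(longest):]
        else:
            tokens.append(text[0])
            text = text[1:]
    return tokens

if __name__ == "__main__":
    trie = build_trie(TOY_VOCAB)
    print(tokenize_trie("hello world", trie))         # ['hello', ' ', 'world']
    print(tokenize_linear("hello world", TOY_VOCAB))  # same result, but O(len(vocab)) work per position

Both strategies produce the same greedy segmentation; they differ only in how quickly the longest match at each position is found.
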
golden_ratio_1_million_digits.txt
ADDED
The diff for this file is too large to render.

golden_ratio_1_million_digits.txt.tokens
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:609595bf41569a898a1b036eb67f6c2a2213fc84de7099f3e11730073b3d6c04
size 16842482

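The .tokens file for the golden-ratio test is stored through Git LFS, so the commit only carries the small pointer above: the LFS spec version, the SHA-256 object id, and the object size in bytes. As a rough sketch (not part of this commit), a pointer in that format can be parsed with a few lines of Python, assuming the path still holds the pointer text rather than the smudged 16 MB object:

def read_lfs_pointer(path: str) -> dict:
    # Each pointer line is "key value"; collect them into a dict.
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

ptr = read_lfs_pointer("golden_ratio_1_million_digits.txt.tokens")
print(ptr["oid"], int(ptr["size"]))  # sha256:609595bf... 16842482
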
groups_merged-enhancedV3.txt
ADDED
The diff for this file is too large to render.

groups_merged-enhancedV3.txt.tokens
ADDED
The diff for this file is too large to render.

prompt_test.txt
ADDED
@@ -0,0 +1,16 @@
/*
 * Copyright 2000-2009 JetBrains s.r.o.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
Explain this copyright license.

prompt_test.txt.tokens
ADDED
[
  {"token": "/*", "id": 32125},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " Copyright", "id": 3800},
  {"token": " 2000", "id": 1453},
  {"token": "-", "id": 31893},
  {"token": "2009", "id": 32787},
  {"token": " Jet", "id": 6203},
  {"token": "Brain", "id": 35944},
  {"token": "s", "id": 58607},
  {"token": " s", "id": 25844},
  {"token": ".", "id": 31979},
  {"token": "r", "id": 57555},
  {"token": ".", "id": 31979},
  {"token": "o", "id": 55302},
  {"token": ".", "id": 31979},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " Licensed", "id": 6684},
  {"token": " under", "id": 28977},
  {"token": " the", "id": 28194},
  {"token": " Apache", "id": 2347},
  {"token": " License", "id": 6683},
  {"token": ",", "id": 31833},
  {"token": " Version", "id": 10644},
  {"token": " 2", "id": 1450},
  {"token": ".", "id": 31979},
  {"token": "0", "id": 32168},
  {"token": " ", "id": 411},
  {"token": "(", "id": 31193},
  {"token": "the", "id": 60198},
  {"token": " ", "id": 411},
  {"token": "\"", "id": 30684},
  {"token": "License", "id": 39610},
  {"token": "\");", "id": 30727},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " you", "id": 30159},
  {"token": " may", "id": 21448},
  {"token": " not", "id": 22440},
  {"token": " use", "id": 29209},
  {"token": " this", "id": 28263},
  {"token": " file", "id": 17775},
  {"token": " except", "id": 17281},
  {"token": " in", "id": 19621},
  {"token": " compliance", "id": 14422},
  {"token": " with", "id": 29944},
  {"token": " the", "id": 28194},
  {"token": " License", "id": 6683},
  {"token": ".", "id": 31979},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " You", "id": 11050},
  {"token": " may", "id": 21448},
  {"token": " obtain", "id": 22612},
  {"token": " a", "id": 11238},
  {"token": " copy", "id": 14878},
  {"token": " of", "id": 22656},
  {"token": " the", "id": 28194},
  {"token": " License", "id": 6683},
  {"token": " at", "id": 12373},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " http", "id": 19315},
  {"token": "://", "id": 33845},
  {"token": "www", "id": 62180},
  {"token": ".", "id": 31979},
  {"token": "apache", "id": 45677},
  {"token": ".", "id": 31979},
  {"token": "org", "id": 56034},
  {"token": "/", "id": 32099},
  {"token": "licenses", "id": 53796},
  {"token": "/", "id": 32099},
  {"token": "LICENSE", "id": 39458},
  {"token": "-", "id": 31893},
  {"token": "2", "id": 32769},
  {"token": ".", "id": 31979},
  {"token": "0", "id": 32168},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " Unless", "id": 10498},
  {"token": " required", "id": 25350},
  {"token": " by", "id": 13397},
  {"token": " applicable", "id": 12089},
  {"token": " law", "id": 20697},
  {"token": " or", "id": 22820},
  {"token": " agreed", "id": 11648},
  {"token": " to", "id": 28411},
  {"token": " in", "id": 19621},
  {"token": " writing", "id": 30057},
  {"token": ",", "id": 31833},
  {"token": " software", "id": 26849},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " distributed", "id": 16197},
  {"token": " under", "id": 28977},
  {"token": " the", "id": 28194},
  {"token": " License", "id": 6683},
  {"token": " is", "id": 20239},
  {"token": " distributed", "id": 16197},
  {"token": " on", "id": 22716},
  {"token": " an", "id": 11871},
  {"token": " ", "id": 411},
  {"token": "\"", "id": 30684},
  {"token": "AS", "id": 35173},
  {"token": " IS", "id": 5883},
  {"token": "\"", "id": 30684},
  {"token": " BASIS", "id": 2583},
  {"token": ",", "id": 31833},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " WITHOUT", "id": 10757},
  {"token": " WARRANTIES", "id": 10735},
  {"token": " OR", "id": 7753},
  {"token": " CONDITIONS", "id": 3187},
  {"token": " OF", "id": 7733},
  {"token": " ANY", "id": 2014},
  {"token": " KIND", "id": 6288},
  {"token": ",", "id": 31833},
  {"token": " either", "id": 16672},
  {"token": " express", "id": 17456},
  {"token": " or", "id": 22820},
  {"token": " implied", "id": 19580},
  {"token": ".", "id": 31979},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " See", "id": 9386},
  {"token": " the", "id": 28194},
  {"token": " License", "id": 6683},
  {"token": " for", "id": 18039},
  {"token": " the", "id": 28194},
  {"token": " specific", "id": 26985},
  {"token": " language", "id": 20643},
  {"token": " governing", "id": 18631},
  {"token": " permissions", "id": 23414},
  {"token": " and", "id": 11913},
  {"token": "\n", "id": 38},
  {"token": " *", "id": 1020},
  {"token": " limitations", "id": 20911},
  {"token": " under", "id": 28977},
  {"token": " the", "id": 28194},
  {"token": " License", "id": 6683},
  {"token": ".", "id": 31979},
  {"token": "\n", "id": 38},
  {"token": " */", "id": 1036},
  {"token": "\n", "id": 38},
  {"token": " Expl", "id": 4745},
  {"token": "ain", "id": 45149},
  {"token": " this", "id": 28263},
  {"token": " copyright", "id": 14880},
  {"token": " license", "id": 20861},
  {"token": ".", "id": 31979},
  {"total": 158}
]

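The .tokens files written by tokenize.py (shown in full above for prompt_test.txt) are JSON arrays of {"token", "id"} records followed by a single {"total": N} summary entry. The sketch below, which is not part of the commit, shows one way to consume that format; it assumes the file sits in the working directory and relies on the fact that decode() in the tokenizer is plain concatenation, so joining the tokens reproduces the original prompt.

import json

with open("prompt_test.txt.tokens", encoding="utf-8") as f:
    entries = json.load(f)

records = [e for e in entries if "token" in e]              # per-token records
total = next(e["total"] for e in entries if "total" in e)   # trailing summary entry
assert total == len(records)                                # 158 for the file above

ids = [e["id"] for e in records]
text = "".join(e["token"] for e in records)                 # concatenation reverses tokenization
print(total, ids[:5])
print(text.splitlines()[0])                                 # "/*" -- first line of the original prompt
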
tokenize.py
ADDED
@@ -0,0 +1,163 @@
import re
import json
import argparse
from typing import List, Dict
import bisect

class ClaudeTokenizer:
    def __init__(self, config_file: str, algorithm: str = "trie"):
        with open(config_file, "r") as f:
            config = json.load(f)

        self.vocab = sorted(config["vocab"])  # Sort vocab for binary search
        self.vocab_size = config["n_vocab_size"]
        self.pat_str = config["pat_str"]
        self.special_tokens = config["special_tokens"]

        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = {i: token for token, i in self.token_to_id.items()}

        for token, id in self.special_tokens.items():
            self.token_to_id[token] = id
            self.id_to_token[id] = token

        self.pat = re.compile(self.pat_str)
        self.vocab_trie = self._build_trie(self.vocab)

        self.algorithm = algorithm
        if algorithm not in ["trie", "linear"]:
            raise ValueError("Invalid algorithm. Choose 'trie' or 'linear'.")

    def _build_trie(self, vocab: List[str]) -> Dict:
        trie = {}
        for token in vocab:
            current = trie
            for char in token:
                if isinstance(current, str):
                    break
                if char not in current:
                    current[char] = {}
                current = current[char]
            if isinstance(current, dict):
                current["*"] = token
        return trie

    def tokenize(self, text: str) -> List[str]:
        if self.algorithm == "trie":
            tokens = []
            for part in self.pat.findall(text):
                tokens.extend(self._tokenize_part_trie(part))
            return tokens
        else:
            return self._tokenize_part_linear(text)

    def encode(self, text: str) -> List[int]:
        tokens = self.tokenize(text)
        return [
            self.token_to_id.get(token, self.special_tokens["<META>"])
            for token in tokens
        ]

    def decode(self, ids: List[int]) -> str:
        return "".join(self.id_to_token.get(id, "") for id in ids)

    def _tokenize_part_trie(self, text: str) -> List[str]:
        tokens = []
        while text:
            current = self.vocab_trie
            longest_match = ""
            for i, char in enumerate(text):
                if char not in current:
                    break
                current = current[char]
                if "*" in current:
                    longest_match = current["*"]
            if longest_match:
                tokens.append(longest_match)
                text = text[len(longest_match):]
            else:
                tokens.append(text[0])
                text = text[1:]
        return tokens

    def _tokenize_part_linear(self, text: str) -> List[str]:
        tokens = []
        while text:
            longest_match = self._binary_search_prefix(text)
            if longest_match:
                tokens.append(longest_match)
                text = text[len(longest_match):]
            else:
                tokens.append(text[0])
                text = text[1:]
        return tokens

    def _binary_search_prefix(self, text: str) -> str:
        left, right = 0, len(self.vocab) - 1
        longest_match = ""

        while left <= right:
            mid = (left + right) // 2
            if text.startswith(self.vocab[mid]):
                longest_match = self.vocab[mid]
                left = mid + 1
            elif self.vocab[mid] < text:
                left = mid + 1
            else:
                right = mid - 1

        return longest_match

def process_file(file_path: str, tokenizer: ClaudeTokenizer) -> List[Dict]:
    encodings = ['utf-8', 'utf-16', 'latin-1', 'iso-8859-1']

    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                text = f.read()
            break
        except UnicodeDecodeError:
            continue
    else:
        raise ValueError(f"Unable to decode the file {file_path} with any of the attempted encodings.")

    tokens = tokenizer.tokenize(text)
    encoded = tokenizer.encode(text)

    result = [{"token": token, "id": id} for token, id in zip(tokens, encoded)]
    result.append({"total": len(tokens)})

    return result

def main():
    parser = argparse.ArgumentParser(description="Tokenize text using Claude Tokenizer")
    parser.add_argument("--text", type=str, help="Text to tokenize")
    parser.add_argument("--file", type=str, help="File to tokenize")
    parser.add_argument("--algo", type=str, choices=["linear", "trie"], required=True, help="Tokenization algorithm")
    args = parser.parse_args()

    if not args.text and not args.file:
        parser.error("Either --text or --file must be specified")

    try:
        tokenizer = ClaudeTokenizer("tokenizer_config.json", algorithm=args.algo)

        if args.file:
            result = process_file(args.file, tokenizer)
            output_file = args.file + ".tokens"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Tokenization results saved to {output_file}")
        else:
            tokens = tokenizer.tokenize(args.text)
            encoded = tokenizer.encode(args.text)
            result = [{"token": token, "id": id} for token, id in zip(tokens, encoded)]
            result.append({"total": len(tokens)})
            print(json.dumps(result, indent=2, ensure_ascii=False))
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

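tokenize.py wires the tokenizer to an argparse command line: --text or --file supplies the input, and the required --algo flag picks "trie" or "linear"; file input writes the results to <input>.tokens next to the source file. The sketch below shows the expected invocations and the equivalent programmatic use. It assumes a tokenizer_config.json with the keys the script reads ("vocab", "n_vocab_size", "pat_str", and "special_tokens" including "<META>") is present in the working directory, and the input names are illustrative.

# Expected command-line usage (illustrative):
#   python tokenize.py --text "Hello, world!" --algo trie
#   python tokenize.py --file prompt_test.txt --algo linear   # writes prompt_test.txt.tokens
#
# Equivalent programmatic use, e.g. in a REPL after running the definitions above:
tokenizer = ClaudeTokenizer("tokenizer_config.json", algorithm="trie")

print(tokenizer.tokenize("Hello! How can I assist you today?"))
print(tokenizer.decode(tokenizer.encode("Hello!")))

result = process_file("prompt_test.txt", tokenizer)  # [{"token": ..., "id": ...}, ..., {"total": N}]
print(result[-1])                                    # the committed prompt_test.txt.tokens ends with {"total": 158}

One thing worth keeping in mind: the file name shadows Python's standard-library tokenize module, so importing these classes from other code (rather than running the script directly) needs a little care.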