Taizo Kaneko committed
Commit · 6ee9897
Parent(s): 64ec444

commit files to HF hub
Files changed:
- fasttext_jp_embedding.py  +10 -1
- fasttext_jp_tokenizer.py  +62 -11
- mecab_tokenizer.py  +2 -0
fasttext_jp_embedding.py
CHANGED
@@ -6,11 +6,18 @@ import torch
 
 
 class FastTextJpConfig(PretrainedConfig):
-    """
+    """Config for FastTextJpModel.
     """
     model_type = "fasttext_jp"
 
     def __init__(self, tokenizer_class="FastTextJpTokenizer", **kwargs):
+        """Initialization.
+
+        Args:
+            tokenizer_class (str, optional):
+                If tokenizer_class is not specified, the tokenizer will not be loaded from a pipeline.
+                The value is written to config.json.
+        """
         kwargs["tokenizer_class"] = tokenizer_class
         super().__init__(**kwargs)
 
@@ -29,5 +36,7 @@ class FastTextJpModel(PreTrainedModel):
         return self.word_embeddings(torch.tensor([0]))
 
 
+# Registration with AutoModel is required, but the recommended way to do it seems to keep changing and has not settled yet. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
 FastTextJpConfig.register_for_auto_class()
 FastTextJpModel.register_for_auto_class("AutoModel")
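Not part of this commit, but for context: once these files are on the Hub, the classes registered above are meant to be resolved through the Auto classes with trust_remote_code=True, which downloads and runs the custom code. A minimal, hypothetical usage sketch (the repo id is a placeholder):

# Hypothetical usage sketch; "user/fasttext_jp_embedding" is a placeholder repo id.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("user/fasttext_jp_embedding", trust_remote_code=True)
model = AutoModel.from_pretrained("user/fasttext_jp_embedding", trust_remote_code=True)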
fasttext_jp_tokenizer.py
CHANGED
@@ -6,6 +6,16 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 
 
 def save_stoi(stoi: dict[str, int], vocab_file: str):
+    """Saves the token-to-ID dictionary to vocab_file as an array.
+
+    Args:
+        stoi (dict[str, int]): token-to-ID mapping
+        vocab_file (str): path to write to
+
+    Raises:
+        ValueError: raised if the IDs are not contiguous.
+    """
+
     with open(vocab_file, "w", encoding="utf-8") as writer:
         index = 0
         for token, token_index in sorted(stoi.items(), key=lambda kv: kv[1]):
@@ -18,9 +28,21 @@ def save_stoi(stoi: dict[str, int], vocab_file: str):
 
 
 def load_stoi(vocab_file: str) -> dict[str, int]:
+    """Loads the token-to-ID dictionary from a file.
+
+    Args:
+        vocab_file (str): path to the file
+
+    Returns:
+        dict[str, int]: token-to-ID mapping
+    """
+
     stoi: dict[str, int] = {}
+    # Read the tokens from the file.
     with open(vocab_file, "r", encoding="utf-8") as reader:
         tokens = reader.readlines()
+
+    # Build the token-to-ID mapping.
     for index, token in enumerate(tokens):
         token = token.rstrip("\n")
         stoi[token] = index
@@ -28,8 +50,12 @@ def load_stoi(vocab_file: str) -> dict[str, int]:
 
 
 class FastTextJpTokenizer(MeCabTokenizer):
+
+    # Needed so that the Config recognizes this class.
+    # https://huggingface.co/docs/transformers/custom_models#writing-a-custom-configuration
     model_type = "fasttext_jp"
 
+    # Probably needed so that vocab.txt is recognized.
     vocab_files_names = VOCAB_FILES_NAMES
 
     def __init__(self,
@@ -53,35 +79,58 @@ class FastTextJpTokenizer(MeCabTokenizer):
         )
         self.stoi = load_stoi(vocab_file)
         self.itos = dict([(ids, tok) for tok, ids in self.stoi.items()])
-        self.v_size = len(self.stoi)
-
-        # self._auto_map = {
-        #     "AutoTokenizer": ["modeling.FastTextMeCabTokenizer", None]
-        # }
-        # self.init_inputs = ["vocab.txt"]
 
     @property
    def vocab_size(self) -> int:
+        """Size of the vocabulary.
+        ※ Required method that a PreTrainedTokenizer must implement.
+
+        Returns:
+            int: size of the vocabulary
         """
-
-        """
-        return self.v_size
+        return len(self.stoi)
 
     def _convert_token_to_id(self, token: str) -> int:
+        """Token to ID.
+        ※ Required method that a PreTrainedTokenizer must implement.
+
+        Args:
+            token (str): token
+
+        Returns:
+            int: ID
+        """
         return self.stoi[token]
 
     def _convert_id_to_token(self, index: int) -> str:
+        """ID to token.
+        ※ Required method that a PreTrainedTokenizer must implement.
+
+        Args:
+            index (int): ID
+
+        Returns:
+            str: token
+        """
         return self.itos[index]
 
     def save_vocabulary(self,
                         save_directory: str,
                         filename_prefix: str | None = None) -> tuple[str]:
-
+        """Saves the vocabulary.
+
+        Args:
+            save_directory (str): directory to save to. The file name is fixed to vocab.txt.
+            filename_prefix (str | None, optional): prefix for the file name
+
+        Returns:
+            tuple[str]: returns the file name.
+        """
         if os.path.isdir(save_directory):
             vocab_file = os.path.join(
                 save_directory,
                 (filename_prefix + "-" if filename_prefix else "") +
-                "
+                VOCAB_FILES_NAMES["vocab_file"])
         else:
             vocab_file = (filename_prefix +
                           "-" if filename_prefix else "") + save_directory
@@ -89,4 +138,6 @@ class FastTextJpTokenizer(MeCabTokenizer):
         return (vocab_file, )
 
 
+# Registration with AutoTokenizer is required, but the recommended way to do it seems to keep changing and has not settled yet. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
 FastTextJpTokenizer.register_for_auto_class("AutoTokenizer")
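Not part of the commit, but as a quick illustration of the two vocab helpers documented above, a hypothetical round trip (example tokens and path; assumes the script runs next to fasttext_jp_tokenizer.py and that the IDs are contiguous from 0, which is what save_stoi enforces):

# Hypothetical round trip for save_stoi / load_stoi; tokens and path are examples.
from fasttext_jp_tokenizer import save_stoi, load_stoi

stoi = {"こんにちは": 0, "世界": 1, "です": 2}  # contiguous IDs starting at 0
save_stoi(stoi, "vocab.txt")                    # writes one token per line, ordered by ID
assert load_stoi("vocab.txt") == stoi           # the line number becomes the ID again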
mecab_tokenizer.py
CHANGED
@@ -5,6 +5,8 @@ from transformers import PreTrainedTokenizer
 
 
 class MeCabResult(NamedTuple):
+    """Type for a MeCab parse result.
+    """
     hyosokei: str
     hinshi: str
     hinshi_saibunrui_1: str