Commit cd708ec
Parent(s): 624864d

commit files to HF hub

Files changed:
- fasttext_fsc.py +6 -3
- fasttext_jp_embedding.py +13 -1
- mecab_tokenizer.py +2 -0
fasttext_fsc.py CHANGED

@@ -34,7 +34,7 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
 
     def __init__(self, config: FastTextForSeuqenceClassificationConfig):
 
-        self.ngram = config.ngram
+        self.max_ngram = config.ngram
         super().__init__(config)
 
     def forward(self, **inputs) -> SequenceClassifierOutput:
@@ -58,7 +58,7 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
                                            attention_mask == 1)]
         candidate_label = output[torch.logical_and(token_type_ids == 1,
                                                    attention_mask == 1)]
-        sentence_words = self.split_ngram(sentence, self.ngram)
+        sentence_words = self.split_ngram(sentence, self.max_ngram)
         candidate_label_mean = torch.mean(candidate_label,
                                           dim=-2,
                                           keepdim=True)
@@ -76,7 +76,8 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
             self, sentence_words: TensorType["words", "vectors"],
             candidate_label_means: TensorType[1, "vectors"]) -> TensorType[1]:
         res = torch.tensor(0.)
-        for sw in sentence_words:
+        for i in range(len(sentence_words)):
+            sw = sentence_words[i]
             p = torch.nn.functional.cosine_similarity(sw,
                                                       candidate_label_means[0],
                                                       dim=0)
@@ -87,6 +88,8 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
     def split_ngram(self, sentences: TensorType["word", "vectors"],
                     n: int) -> TensorType["word", "vectors"]:
         res = []
+        if len(sentences) <= n:
+            return torch.stack([torch.mean(sentences, dim=0, keepdim=False)])
         for i in range(len(sentences) - n + 1):
             ngram = sentences[i:i + n]
             res.append(torch.mean(ngram, dim=0, keepdim=False))
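This commit renames the n-gram setting read in __init__ to max_ngram, rewrites the similarity loop as index-based iteration, and adds a short-input guard to split_ngram. A minimal standalone sketch of the new split_ngram behavior follows; the final torch.stack is an assumption, since the hunk does not show the method's return:

import torch

def split_ngram(sentences: torch.Tensor, n: int) -> torch.Tensor:
    # sentences: [words, vectors]
    # Guard added in this commit: a sentence with at most n word vectors
    # yields one whole-sentence mean instead of an empty n-gram list.
    if len(sentences) <= n:
        return torch.stack([torch.mean(sentences, dim=0, keepdim=False)])
    res = []
    for i in range(len(sentences) - n + 1):
        ngram = sentences[i:i + n]  # sliding window of n word vectors
        res.append(torch.mean(ngram, dim=0, keepdim=False))
    return torch.stack(res)  # assumption: the real method stacks res

words = torch.randn(2, 300)         # only 2 word vectors, fewer than n=3
print(split_ngram(words, 3).shape)  # torch.Size([1, 300])

Without the guard, a two-word sentence with n=3 would leave the loop with zero iterations and produce no n-gram vectors at all.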
fasttext_jp_embedding.py CHANGED

@@ -11,14 +11,26 @@ class FastTextJpConfig(PretrainedConfig):
     """
     model_type = "fasttext_jp"
 
-    def __init__(self, tokenizer_class="FastTextJpTokenizer", **kwargs):
+    def __init__(self,
+                 vocab_size=1,
+                 hidden_size=1,
+                 tokenizer_class="FastTextJpTokenizer",
+                 **kwargs):
         """Initialization.
 
         Args:
             tokenizer_class (str, optional):
                 If tokenizer_class is not specified, it will not be loaded by the pipeline.
                 It is written to config.json.
+            vocab_size (str, optional):
+                If vocab_size is not specified, it will not be loaded by the pipeline.
+                It is written to config.json.
+            hidden_size (str, optional):
+                If hidden_size is not specified, it will not be loaded by the pipeline.
+                It is written to config.json.
         """
+        kwargs["vocab_size"] = vocab_size
+        kwargs["hidden_size"] = hidden_size
         kwargs["tokenizer_class"] = tokenizer_class
         super().__init__(**kwargs)
 
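The config change gives vocab_size and hidden_size explicit defaults and routes them through kwargs, so PretrainedConfig stores them as attributes and serializes them into config.json, where the pipeline can read them back. A minimal sketch of that round trip; the class body just repeats the committed __init__, and the sizes passed at the end are illustrative:

from transformers import PretrainedConfig

class FastTextJpConfig(PretrainedConfig):
    model_type = "fasttext_jp"

    def __init__(self,
                 vocab_size=1,
                 hidden_size=1,
                 tokenizer_class="FastTextJpTokenizer",
                 **kwargs):
        # Values routed through kwargs become attributes on the config
        # and are included when it is saved as config.json.
        kwargs["vocab_size"] = vocab_size
        kwargs["hidden_size"] = hidden_size
        kwargs["tokenizer_class"] = tokenizer_class
        super().__init__(**kwargs)

config = FastTextJpConfig(vocab_size=10000, hidden_size=300)  # illustrative sizes
print(config.to_json_string())  # includes vocab_size, hidden_size, tokenizer_class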
mecab_tokenizer.py CHANGED

@@ -12,6 +12,8 @@ class MeCabResult(NamedTuple):
 
 
 class MeCabTokenizer(PreTrainedTokenizer):
+    target_hinshi: list[str] | None
+    mecab: MeCab.Tagger
 
     def __init__(self,
                  hinshi: list[str] | None = None,