fix: `train_on_inputs: true` ignored for sharegpt (#1045) [skip ci]
* fix: `train_on_inputs: true` ignored for sharegpt
* enable unit test for train_on_inputs for sharegpt
---------
Co-authored-by: Wing Lian <[email protected]>
src/axolotl/prompt_tokenizers.py
CHANGED
|
@@ -379,10 +379,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
|
| 379 |
add_eos_token=False,
|
| 380 |
strip_bos_token=True,
|
| 381 |
)
|
| 382 |
-
|
| 383 |
-
|
|
|
|
|
|
|
|
|
|
| 384 |
elif assistant in role:
|
| 385 |
-
# TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
|
| 386 |
role = (
|
| 387 |
role.replace(role_remap[1]["from"], role_remap[1]["to"])
|
| 388 |
if role_remap
|
|
@@ -406,18 +408,24 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
|
| 406 |
add_eos_token=False,
|
| 407 |
strip_bos_token=True,
|
| 408 |
)
|
| 409 |
-
# not masked out from labels
|
| 410 |
labels = copy.deepcopy(res["input_ids"])
|
| 411 |
-
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
elif role == "":
|
| 414 |
turn = content
|
| 415 |
# this is only ever the first part, should include the bos token and the user query
|
| 416 |
res = self._tokenize(
|
| 417 |
turn, add_eos_token=False, strip_bos_token=False
|
| 418 |
)
|
| 419 |
-
|
| 420 |
-
|
|
|
|
|
|
|
|
|
|
| 421 |
else:
|
| 422 |
LOG.warning(f"unhandled role: {role}")
|
| 423 |
continue
|
|
|
|
| 379 |
add_eos_token=False,
|
| 380 |
strip_bos_token=True,
|
| 381 |
)
|
| 382 |
+
if self.train_on_inputs:
|
| 383 |
+
labels = copy.deepcopy(res["input_ids"])
|
| 384 |
+
else:
|
| 385 |
+
# everything from this is masked out from the labels
|
| 386 |
+
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
| 387 |
elif assistant in role:
|
|
|
|
| 388 |
role = (
|
| 389 |
role.replace(role_remap[1]["from"], role_remap[1]["to"])
|
| 390 |
if role_remap
|
|
|
|
| 408 |
add_eos_token=False,
|
| 409 |
strip_bos_token=True,
|
| 410 |
)
|
|
|
|
| 411 |
labels = copy.deepcopy(res["input_ids"])
|
| 412 |
+
if not self.train_on_inputs:
|
| 413 |
+
# mask out role tokens from the labels
|
| 414 |
+
len_role = len(role_res["input_ids"])
|
| 415 |
+
labels[:len_role] = [IGNORE_TOKEN_ID] * min(
|
| 416 |
+
len_role, len(labels)
|
| 417 |
+
)
|
| 418 |
elif role == "":
|
| 419 |
turn = content
|
| 420 |
# this is only ever the first part, should include the bos token and the user query
|
| 421 |
res = self._tokenize(
|
| 422 |
turn, add_eos_token=False, strip_bos_token=False
|
| 423 |
)
|
| 424 |
+
if self.train_on_inputs:
|
| 425 |
+
labels = copy.deepcopy(res["input_ids"])
|
| 426 |
+
else:
|
| 427 |
+
# everything from this is masked out from the labels
|
| 428 |
+
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
| 429 |
else:
|
| 430 |
LOG.warning(f"unhandled role: {role}")
|
| 431 |
continue
|
tests/prompt_strategies/test_sharegpt.py
CHANGED
|
@@ -104,7 +104,7 @@ class TestSharegpt:
|
|
| 104 |
role_key_human=None,
|
| 105 |
),
|
| 106 |
tokenizer,
|
| 107 |
-
|
| 108 |
2048, # sequence_len
|
| 109 |
)
|
| 110 |
|
|
@@ -124,30 +124,30 @@ class TestSharegpt:
|
|
| 124 |
]
|
| 125 |
# fmt: on
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
| 104 |
role_key_human=None,
|
| 105 |
),
|
| 106 |
tokenizer,
|
| 107 |
+
False, # train_on_inputs
|
| 108 |
2048, # sequence_len
|
| 109 |
)
|
| 110 |
|
|
|
|
| 124 |
]
|
| 125 |
# fmt: on
|
| 126 |
|
| 127 |
+
def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
|
| 128 |
+
strategy = SimpleShareGPTPromptTokenizingStrategy(
|
| 129 |
+
ShareGPTPrompterV2(
|
| 130 |
+
conversation="chatml",
|
| 131 |
+
role_key_model=None,
|
| 132 |
+
role_key_human=None,
|
| 133 |
+
),
|
| 134 |
+
tokenizer,
|
| 135 |
+
True, # train_on_inputs
|
| 136 |
+
2048, # sequence_len
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
dataset_wrapper = TokenizedPromptDataset(
|
| 140 |
+
strategy, sharegpt_dataset, process_count=1
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
labels = dataset_wrapper[0]["labels"]
|
| 144 |
+
# fmt: off
|
| 145 |
+
assert labels == [
|
| 146 |
+
1, # bos
|
| 147 |
+
32001, 1587, 13, 25997, 32000, 28705, 13, # system
|
| 148 |
+
32001, 2188, 13, 21558, 32000, 28705, 13, # human
|
| 149 |
+
32001, 13892, 13, 21558, 32000, 28705, 13, # gpt
|
| 150 |
+
32001, 2188, 13, 12684, 17664, 32000, 28705, 13, # human
|
| 151 |
+
32001, 13892, 13, 12684, 17664, 32000, 28705, 13, # gpt
|
| 152 |
+
]
|
| 153 |
+
# fmt: on
|