Spaces:

Dovakiins
/

qwerrwe

Build error

App Files Files Community

winglian commited on Jun 11, 2023

Commit

aac4b76

1 Parent(s): f31a338

add new sharegpt, refactor prompt so it can be customized later, add exception if no data is processed

Browse files

Files changed (6) hide show

README.md +9 -1
src/axolotl/datasets.py +4 -0
src/axolotl/prompt_strategies/sharegpt_jokes.py +28 -0
src/axolotl/prompt_strategies/sharegpt_simple.py +21 -0
src/axolotl/prompters.py +19 -14
src/axolotl/utils/data.py +9 -2

README.md CHANGED Viewed

@@ -219,6 +219,14 @@ Have dataset(s) in one of the following format (JSONL recommended):
   ```json
   {"conversations": [{"role": "...", "value": "..."}]}
   ```
 </details>
@@ -530,7 +538,7 @@ Try set `fp16: true`
 Try to turn off xformers.
-## Need help? 🙋‍♂️
 Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you

   ```json
   {"conversations": [{"role": "...", "value": "..."}]}
   ```
+- `sharegpt_simple.load_role`: conversations where `role` is used instead of `from`
+  ```json
+  {"conversations": [{"role": "...", "value": "..."}]}
+  ```
+- `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
+  ```json
+  {"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
+  ```
 </details>
 Try to turn off xformers.
+## Need help? 🙋♂️
 Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you

src/axolotl/datasets.py CHANGED Viewed

@@ -33,12 +33,16 @@ class TokenizedPromptDataset(IterableDataset):
     def __iter__(self):
         iterator = iter(self.dataset)
         # Loop through the entire dataset
         for example in iterator:
             try:
                 yield self.prompt_tokenizer.tokenize_prompt(example)
             except InvalidDataException:
                 pass
 # TODO this isn't the best since it can't interleave datasets

     def __iter__(self):
         iterator = iter(self.dataset)
+        count = 0
         # Loop through the entire dataset
         for example in iterator:
             try:
                 yield self.prompt_tokenizer.tokenize_prompt(example)
+                count += 1
             except InvalidDataException:
                 pass
+        if count == 0:
+            raise RuntimeError("Expected at least one datapoint in dataset.")
 # TODO this isn't the best since it can't interleave datasets

src/axolotl/prompt_strategies/sharegpt_jokes.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""Module for Jokes prompts using sharegpt style """
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import PromptStyle, ShareGPTPrompter
+def load(tokenizer, cfg):
+    return SimpleJokesShareGPTPromptTokenizingStrategy(
+        ShareGPTPrompter(PromptStyle.CHAT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+class SimpleJokesShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    Tokenization strategy for asking bot to tell a joke and then explain why its funny
+    """
+    # title, text, explanation
+    def get_conversation_thread(self, prompt):
+        title = "" if not prompt["title"] else prompt["title"] + " "
+        return [
+            {"from": "human", "value": "Tell me a joke."},
+            {"from": "gpt", "value": title + prompt["text"]},
+            {"from": "human", "value": "Why is that joke funny?"},
+            {"from": "gpt", "value": prompt["explanation"]},
+        ]

src/axolotl/prompt_strategies/sharegpt_simple.py CHANGED Viewed

@@ -13,6 +13,15 @@ def load(tokenizer, cfg):
     )
 def load_guanaco(tokenizer, cfg):
     return GuanacoShareGPTPromptTokenizingStrategy(
         ShareGPTPrompter(PromptStyle.CHAT.value),
@@ -31,6 +40,18 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
         return prompt["conversations"]
 class GuanacoShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
     """
     sharegpt strategy that remaps oasst data to sharegpt format

     )
+def load_role(tokenizer, cfg):
+    return SimpleRoleShareGPTPromptTokenizingStrategy(
+        ShareGPTPrompter(PromptStyle.CHAT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
 def load_guanaco(tokenizer, cfg):
     return GuanacoShareGPTPromptTokenizingStrategy(
         ShareGPTPrompter(PromptStyle.CHAT.value),
         return prompt["conversations"]
+class SimpleRoleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    basic sharegpt strategy to grab conversations from the sample row, but uses role instead of from
+    """
+    def get_conversation_thread(self, prompt):
+        conversations = prompt["conversations"]
+        # remap role: prompter/assistant, text: ... => from: human/gpt, value: ...
+        turns = [{"from": t["role"], "value": t["value"]} for t in conversations]
+        return turns
 class GuanacoShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
     """
     sharegpt strategy that remaps oasst data to sharegpt format

src/axolotl/prompters.py CHANGED Viewed

@@ -261,28 +261,33 @@ class Conversation:
         self.messages.append([role, message])
-conv_vicuna_v1_1 = Conversation(
-    system="A chat between a curious user and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
-    roles=["USER", "ASSISTANT"],
-    messages=[],
-    offset=0,
-    sep_style=SeparatorStyle.TWO,
-    sep=" ",
-    sep2=" ",
-)
 class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
     """
     A prompter that generates prompts for the ShareGPT
     """
-    def __init__(self, prompt_style=None):
         if prompt_style != PromptStyle.CHAT.value:
             raise ValueError(
                 f"unsupported prompt_style for ShareGPTPrompter({prompt_style})"
             )
     # def match_prompt_style(self):
     #     if self.prompt_style == PromptStyle.chat.value:
@@ -300,7 +305,7 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
             # also happens on the data splitting leaving empty conversations
             raise IndexError
-        conv = conv_vicuna_v1_1.copy()
         roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
         try:

         self.messages.append([role, message])
 class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
     """
     A prompter that generates prompts for the ShareGPT
     """
+    def __init__(self, prompt_style=None, system_prompt=None):
         if prompt_style != PromptStyle.CHAT.value:
             raise ValueError(
                 f"unsupported prompt_style for ShareGPTPrompter({prompt_style})"
             )
+        system = (
+            system_prompt
+            if system_prompt
+            else (
+                "A chat between a curious user and an artificial intelligence assistant. "
+                "The assistant gives helpful, detailed, and polite answers to the user's questions."
+            )
+        )
+        self._conversation = Conversation(
+            system=system,
+            roles=["USER", "ASSISTANT"],
+            messages=[],
+            offset=0,
+            sep_style=SeparatorStyle.TWO,
+            sep=" ",
+            sep2=" ",
+        )
     # def match_prompt_style(self):
     #     if self.prompt_style == PromptStyle.chat.value:
             # also happens on the data splitting leaving empty conversations
             raise IndexError
+        conv = self._conversation.copy()
         roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
         try:

src/axolotl/utils/data.py CHANGED Viewed

@@ -239,8 +239,15 @@ def load_tokenized_prepared_datasets(
                 ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
                 datasets.append(ds_wrapper)
             else:
-                logging.error(f"unhandled prompt tokenization strategy: {d.type}")
-                raise ValueError(f"unhandled prompt tokenization strategy: {d.type}")
         logging.info("tokenizing, merging, and shuffling master dataset")
         samples: List[int] = []

                 ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
                 datasets.append(ds_wrapper)
             else:
+                suffix = ""
+                if ":load_" in d.type:
+                    suffix = f" Did you mean {d.type.replace(':load_', '.load_')}?"
+                logging.error(
+                    f"unhandled prompt tokenization strategy: {d.type}. {suffix}"
+                )
+                raise ValueError(
+                    f"unhandled prompt tokenization strategy: {d.type} {suffix}"
+                )
         logging.info("tokenizing, merging, and shuffling master dataset")
         samples: List[int] = []