|  | """ | 
					
						
						|  | Prompt strategies loader for alpaca instruction datasets with system prompts | 
					
						
						|  | """ | 
					
						
						|  | from typing import Generator, Tuple, Union | 
					
						
						|  |  | 
					
						
						|  | from axolotl.prompt_tokenizers import PromptTokenizingStrategy | 
					
						
						|  | from axolotl.prompters import AlpacaPrompter, PromptStyle | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy): | 
					
						
						|  | """ | 
					
						
						|  | Tokenizing strategy for instruction-based prompts. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]: | 
					
						
						|  | return ( | 
					
						
						|  | prompt["instruction"], | 
					
						
						|  | prompt["input"] if "input" in prompt else "", | 
					
						
						|  | prompt["output"], | 
					
						
						|  | prompt["system"], | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | def tokenize_prompt(self, prompt): | 
					
						
						|  |  | 
					
						
						|  | ( | 
					
						
						|  | instruction, | 
					
						
						|  | input, | 
					
						
						|  | response, | 
					
						
						|  | system, | 
					
						
						|  | ) = self.parse_instruction_fields(prompt) | 
					
						
						|  | user_prompt = next( | 
					
						
						|  | iter( | 
					
						
						|  | self.prompter.build_prompt_w_system( | 
					
						
						|  | system, | 
					
						
						|  | instruction, | 
					
						
						|  | input, | 
					
						
						|  | ) | 
					
						
						|  | ) | 
					
						
						|  | ) | 
					
						
						|  | tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False) | 
					
						
						|  | if not self.train_on_inputs: | 
					
						
						|  | user_prompt_len = len(tokenized_prompt["input_ids"]) | 
					
						
						|  |  | 
					
						
						|  | tokenized_prompt["labels"] = [-100] * user_prompt_len | 
					
						
						|  | tokenized_res_prompt = self._tokenize( | 
					
						
						|  | response, strip_bos_token=True, add_eos_token=True | 
					
						
						|  | ) | 
					
						
						|  | tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"] | 
					
						
						|  | tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"] | 
					
						
						|  | tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"] | 
					
						
						|  |  | 
					
						
						|  | return tokenized_prompt | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class SystemDataPrompter(AlpacaPrompter): | 
					
						
						|  | """ | 
					
						
						|  | Alpaca Style Prompter that uses system prompts from the dataset | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def build_prompt_w_system( | 
					
						
						|  | self, | 
					
						
						|  | system: str, | 
					
						
						|  | instruction: str, | 
					
						
						|  | input: Union[None, str] = None, | 
					
						
						|  | output: Union[None, str] = None, | 
					
						
						|  | ) -> Generator[str, None, None]: | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | formatted_sys_prompt = f"### System:\n{system}\n\n" if system else "" | 
					
						
						|  | if input: | 
					
						
						|  | res = formatted_sys_prompt + self.turn_format.format( | 
					
						
						|  | instruction=instruction, input=input | 
					
						
						|  | ) | 
					
						
						|  | else: | 
					
						
						|  | res = formatted_sys_prompt + self.turn_no_input_format.format( | 
					
						
						|  | instruction=instruction | 
					
						
						|  | ) | 
					
						
						|  | if output: | 
					
						
						|  | res = f"{res}{output}" | 
					
						
						|  | yield res | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class OpenOrcaSystemDataPrompter(SystemDataPrompter): | 
					
						
						|  | """ | 
					
						
						|  | Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def match_prompt_style(self): | 
					
						
						|  | if self.prompt_style == PromptStyle.INSTRUCT.value: | 
					
						
						|  | self.turn_format = "### User:\n{instruction}\n\n### Additional Context:\n{input}\n\n### Assistant:\n" | 
					
						
						|  | self.turn_no_input_format = "### User:\n{instruction}\n\n### Assistant:\n" | 
					
						
						|  | if self.prompt_style == PromptStyle.CHAT.value: | 
					
						
						|  | self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:" | 
					
						
						|  | self.turn_no_input_format = "USER: {instruction}\nASSISTANT:" | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy): | 
					
						
						|  | """ | 
					
						
						|  | Tokenizing strategy for OpenOrca datasets | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]: | 
					
						
						|  | return ( | 
					
						
						|  | prompt["question"], | 
					
						
						|  | "", | 
					
						
						|  | prompt["response"], | 
					
						
						|  | prompt["system_prompt"], | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def load(tokenizer, cfg): | 
					
						
						|  | return load_chat(tokenizer, cfg) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def load_instruct(tokenizer, cfg): | 
					
						
						|  | return InstructionWSystemPromptTokenizingStrategy( | 
					
						
						|  | SystemDataPrompter(PromptStyle.INSTRUCT.value), | 
					
						
						|  | tokenizer, | 
					
						
						|  | cfg.train_on_inputs, | 
					
						
						|  | cfg.sequence_len, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def load_chat(tokenizer, cfg): | 
					
						
						|  | return InstructionWSystemPromptTokenizingStrategy( | 
					
						
						|  | SystemDataPrompter(PromptStyle.CHAT.value), | 
					
						
						|  | tokenizer, | 
					
						
						|  | cfg.train_on_inputs, | 
					
						
						|  | cfg.sequence_len, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def load_open_orca(tokenizer, cfg): | 
					
						
						|  | return OpenOrcaPromptTokenizingStrategy( | 
					
						
						|  | OpenOrcaSystemDataPrompter(PromptStyle.INSTRUCT.value), | 
					
						
						|  | tokenizer, | 
					
						
						|  | cfg.train_on_inputs, | 
					
						
						|  | cfg.sequence_len, | 
					
						
						|  | ) | 
					
						
						|  |  |