---
license: other
language:
- en
---

# Using nightwing3 in the mix seems to have been a mistake. Redoing this train

# Model

Uses ChatML; Training details below.

### Single Epoch - 237 Step qlora test train at ebs-16 (bs 8 grad accumulation 2 lr of 1e-6 rank/alpha = 64):

About 550 Conversation keys from each set using seed 69 to shuffle the sets.

![image/png](https://cdn-uploads.huggingface.co/production/uploads/642265bc01c62c1e4102dc36/EfMRH40bxV4cE2au3pPhl.png)

## Unsloth Example for seeding sets randomly/applying chat format.

```python
from datasets import load_dataset, concatenate_datasets
from unsloth.chat_templates import get_chat_template
import os

# Expanded list of dataset identifiers (placeholders — replace with real repo ids)
datasets_list = [
    "hfusername/modelname",
    "hfusername/modelname",
    "hfusername/modelname",
    "hfusername/modelname",
    "hfusername/modelname",
]

# Directory to save the temporary dataset
output_dir = "temp_training_dataset"

# Chat template setup.
# NOTE(review): assumes `tokenizer` is already defined earlier in the notebook
# (e.g. returned by FastLanguageModel.from_pretrained) — confirm before running.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=False,  # When True, <|im_end|> would be mapped to the tokenizer's EOS token instead
)


# Format each conversation in a batch with the ChatML template; returns a
# new "text" column suitable for SFT training.
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos
    ]
    return {"text": texts}


# Load each dataset, apply the formatting function, then shuffle (fixed seed
# for reproducibility) and keep at most `sample_size` rows from each.
def load_format_and_sample(datasets_list, formatting_function, sample_size=550):
    sampled_datasets = []
    for dataset_id in datasets_list:
        # Load the dataset
        dataset = load_dataset(dataset_id, split="train")
        # Apply formatting
        formatted_dataset = dataset.map(formatting_function, batched=True)
        # Shuffle and sample; min() guards against datasets smaller than sample_size
        sampled_dataset = formatted_dataset.shuffle(seed=69).select(
            range(min(len(formatted_dataset), sample_size))
        )
        sampled_datasets.append(sampled_dataset)
    return sampled_datasets


# Load, format, and sample datasets
sampled_datasets = load_format_and_sample(datasets_list, formatting_prompts_func, sample_size=550)

# Combine sampled datasets into one temporary set
temporary_training_set = concatenate_datasets(sampled_datasets)

# Save the dataset locally (exist_ok avoids the check-then-create race)
os.makedirs(output_dir, exist_ok=True)
temporary_training_set.save_to_disk(output_dir)

# Redefine the temporary training set as 'dataset' for further use
dataset = temporary_training_set

# Print info about the combined set
print(f"Temporary training dataset saved to '{output_dir}'")
print(dataset)
```