Shibyan committed 27 days ago

Commit

222aaae

•

1 Parent(s): 2060dcd

Upload folder using huggingface_hub

Browse files

Files changed (28) hide show

README.md +58 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
all_results.json +9 -0
checkpoint-62/README.md +202 -0
checkpoint-62/adapter_config.json +31 -0
checkpoint-62/adapter_model.safetensors +3 -0
checkpoint-62/optimizer.pt +3 -0
checkpoint-62/qwen.tiktoken +0 -0
checkpoint-62/rng_state.pth +3 -0
checkpoint-62/scheduler.pt +3 -0
checkpoint-62/special_tokens_map.json +10 -0
checkpoint-62/tokenization_qwen.py +276 -0
checkpoint-62/tokenizer_config.json +17 -0
checkpoint-62/trainer_state.json +129 -0
checkpoint-62/training_args.bin +3 -0
llamaboard_config.yaml +66 -0
qwen.tiktoken +0 -0
running_log.txt +2 -0
special_tokens_map.json +10 -0
tokenization_qwen.py +276 -0
tokenizer_config.json +17 -0
train_results.json +9 -0
trainer_log.jsonl +13 -0
trainer_state.json +139 -0
training_args.bin +3 -0
training_args.yaml +32 -0
training_loss.png +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,58 @@

+---
+base_model: qwen/Qwen-1_8B-Chat
+library_name: peft
+license: other
+tags:
+- llama-factory
+- lora
+- generated_from_trainer
+model-index:
+- name: train_2024-08-31-17-40-34
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# train_2024-08-31-17-40-34
+This model is a fine-tuned version of [qwen/Qwen-1_8B-Chat](https://huggingface.co/qwen/Qwen-1_8B-Chat) on the glaive_toolcall_en dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- num_epochs: 1.0
+### Training results
+### Framework versions
+- PEFT 0.12.0
+- Transformers 4.44.2
+- Pytorch 2.4.0
+- Datasets 2.21.0
+- Tokenizers 0.19.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "qwen/Qwen-1_8B-Chat",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 4,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "c_proj",
+    "c_attn",
+    "w1",
+    "w2"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfd807cad2fb36649906b71719516e895d388c2396c7d72a4e31387147025c7f
+size 13448712

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.992,
+    "num_input_tokens_seen": 610016,
+    "total_flos": 5596354012643328.0,
+    "train_loss": 0.6199409730972782,
+    "train_runtime": 3421.6329,
+    "train_samples_per_second": 0.292,
+    "train_steps_per_second": 0.018
+}

checkpoint-62/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: qwen/Qwen-1_8B-Chat
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.12.0

checkpoint-62/adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "qwen/Qwen-1_8B-Chat",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 4,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "c_proj",
+    "c_attn",
+    "w1",
+    "w2"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-62/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfd807cad2fb36649906b71719516e895d388c2396c7d72a4e31387147025c7f
+size 13448712

checkpoint-62/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5c61e60e5fe23b7a52c5dfb4349c5c80facbe1c4b0565fa2418530e797c49a6
+size 27031674

checkpoint-62/qwen.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-62/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a74cb2999fd09c30c2676c95a55b375947ad04fa23df46ea458fa59f07eaee5c
+size 13990

checkpoint-62/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d65ff43cc7d5ac74f560f6c36d19c34fda721bbf1bacbbdd9237f934985640c0
+size 1064

checkpoint-62/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

checkpoint-62/tokenization_qwen.py ADDED Viewed

	@@ -0,0 +1,276 @@

+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tokenization classes for QWen."""
+import base64
+import logging
+import os
+import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union
+import tiktoken
+from transformers import PreTrainedTokenizer, AddedToken
+logger = logging.getLogger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ENDOFTEXT = "<|endoftext|>"
+IMSTART = "<|im_start|>"
+IMEND = "<|im_end|>"
+# as the default behavior is changed to allow special tokens in
+# regular texts, the surface forms of special tokens need to be
+# as different as possible to minimize the impact
+EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+# changed to use actual index to avoid misconfiguration with vocabulary expansion
+SPECIAL_START_ID = 151643
+SPECIAL_TOKENS = tuple(
+    enumerate(
+        (
+            (
+                ENDOFTEXT,
+                IMSTART,
+                IMEND,
+            )
+            + EXTRAS
+        ),
+        start=SPECIAL_START_ID,
+    )
+)
+SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+class QWenTokenizer(PreTrainedTokenizer):
+    """QWen tokenizer."""
+    vocab_files_names = VOCAB_FILES_NAMES
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        extra_vocab_file=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # how to handle errors in decoding UTF-8 byte sequences
+        # use ignore if you are in streaming inference
+        self.errors = errors
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
+        self.special_tokens = {
+            token: index
+            for index, token in SPECIAL_TOKENS
+        }
+        # try load extra vocab from file
+        if extra_vocab_file is not None:
+            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
+            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
+            for token, index in extra_mergeable_ranks.items():
+                if token in self.mergeable_ranks:
+                    logger.info(f"extra token {token} exists, skipping")
+                    continue
+                if index in used_ids:
+                    logger.info(f'the index {index} for extra token {token} exists, skipping')
+                    continue
+                self.mergeable_ranks[token] = index
+            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        assert (
+            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+        self.decoder = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }  # type: dict[int, bytes|str]
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.tokenizer = enc  # type: tiktoken.Encoding
+        self.eod_id = self.tokenizer.eot_token
+        self.im_start_id = self.special_tokens[IMSTART]
+        self.im_end_id = self.special_tokens[IMEND]
+    def __getstate__(self):
+        # for pickle lovers
+        state = self.__dict__.copy()
+        del state["tokenizer"]
+        return state
+    def __setstate__(self, state):
+        # tokenizer is not python native; don't pass it; rebuild it
+        self.__dict__.update(state)
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+    def __len__(self) -> int:
+        return self.tokenizer.n_vocab
+    def get_vocab(self) -> Dict[bytes, int]:
+        return self.mergeable_ranks
+    def convert_tokens_to_ids(
+        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> List[int]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.mergeable_ranks.get(token))
+        return ids
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Adding regular tokens is not supported")
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in SPECIAL_TOKENS_SET:
+                raise ValueError("Adding unknown special tokens is not supported")
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary).
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        file_path = os.path.join(save_directory, "qwen.tiktoken")
+        with open(file_path, "w", encoding="utf8") as w:
+            for k, v in self.mergeable_ranks.items():
+                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                w.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs,
+    ) -> List[Union[bytes, str]]:
+        """
+        Converts a string in a sequence of tokens.
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            allowed_special (`Literal["all"]` or `set`):
+                The surface forms of the tokens to be encoded as special tokens in regular texts.
+                Default to "all".
+            disallowed_special (`Literal["all"]` or `Collection`):
+                The surface forms of the tokens that should not be in regular texts and trigger errors.
+                Default to an empty tuple.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific encode method.
+        Returns:
+            `List[bytes|str]`: The list of tokens.
+        """
+        tokens = []
+        text = unicodedata.normalize("NFC", text)
+        # this implementation takes a detour: text -> token id -> token surface forms
+        for t in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[t])
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

checkpoint-62/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_qwen.QWenTokenizer",
+      null
+    ]
+  },
+  "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|im_end|>",
+  "model_max_length": 8192,
+  "pad_token": "<|im_end|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "QWenTokenizer"
+}

checkpoint-62/trainer_state.json ADDED Viewed

	@@ -0,0 +1,129 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7084760069847107,
+      "learning_rate": 4.920192797165511e-05,
+      "loss": 0.8189,
+      "num_input_tokens_seen": 54176,
+      "step": 5
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7297347784042358,
+      "learning_rate": 4.685866540361456e-05,
+      "loss": 0.6742,
+      "num_input_tokens_seen": 102528,
+      "step": 10
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 684049.0625,
+      "learning_rate": 4.3119819680728e-05,
+      "loss": 0.6816,
+      "num_input_tokens_seen": 152912,
+      "step": 15
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5369439125061035,
+      "learning_rate": 3.822410025817406e-05,
+      "loss": 0.6588,
+      "num_input_tokens_seen": 204432,
+      "step": 20
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5415367484092712,
+      "learning_rate": 3.2484078074333954e-05,
+      "loss": 0.5882,
+      "num_input_tokens_seen": 254512,
+      "step": 25
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7139325737953186,
+      "learning_rate": 2.6266229220967818e-05,
+      "loss": 0.6473,
+      "num_input_tokens_seen": 298608,
+      "step": 30
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2127319.0,
+      "learning_rate": 1.9967536997783494e-05,
+      "loss": 0.6343,
+      "num_input_tokens_seen": 348928,
+      "step": 35
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 5908939.5,
+      "learning_rate": 1.399014621105914e-05,
+      "loss": 0.5067,
+      "num_input_tokens_seen": 397216,
+      "step": 40
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1202378.875,
+      "learning_rate": 8.715687931944449e-06,
+      "loss": 0.5386,
+      "num_input_tokens_seen": 444832,
+      "step": 45
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4307733178138733,
+      "learning_rate": 4.480913969818098e-06,
+      "loss": 0.563,
+      "num_input_tokens_seen": 490640,
+      "step": 50
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 11058075.0,
+      "learning_rate": 1.5561966963229924e-06,
+      "loss": 0.5484,
+      "num_input_tokens_seen": 538400,
+      "step": 55
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 2679230.5,
+      "learning_rate": 1.2826691520262114e-07,
+      "loss": 0.5979,
+      "num_input_tokens_seen": 589856,
+      "step": 60
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 62,
+  "num_input_tokens_seen": 610016,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5596354012643328.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-62/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:311875b8ca25e2de38752c7cd348177c9ca8d80ec40f9c44f87bd2bc51a3e94b
+size 5368

llamaboard_config.yaml ADDED Viewed

	@@ -0,0 +1,66 @@

+top.booster: auto
+top.checkpoint_path: []
+top.finetuning_type: lora
+top.model_name: Qwen-1.8B-Chat
+top.quantization_bit: none
+top.quantization_method: bitsandbytes
+top.rope_scaling: none
+top.template: qwen
+train.additional_target: ''
+train.badam_mode: layer
+train.badam_switch_interval: 50
+train.badam_switch_mode: ascending
+train.badam_update_ratio: 0.05
+train.batch_size: 2
+train.compute_type: bf16
+train.create_new_adapter: false
+train.cutoff_len: 1024
+train.dataset:
+- glaive_toolcall_en
+train.dataset_dir: data
+train.ds_offload: false
+train.ds_stage: none
+train.freeze_extra_modules: ''
+train.freeze_trainable_layers: 2
+train.freeze_trainable_modules: all
+train.galore_rank: 16
+train.galore_scale: 0.25
+train.galore_target: all
+train.galore_update_interval: 200
+train.gradient_accumulation_steps: 8
+train.learning_rate: 5e-5
+train.logging_steps: 5
+train.lora_alpha: 16
+train.lora_dropout: 0
+train.lora_rank: 8
+train.lora_target: ''
+train.loraplus_lr_ratio: 0
+train.lr_scheduler_type: cosine
+train.mask_history: false
+train.max_grad_norm: '1.0'
+train.max_samples: '100000'
+train.neat_packing: false
+train.neftune_alpha: 0
+train.num_train_epochs: '3.0'
+train.optim: adamw_torch
+train.packing: false
+train.ppo_score_norm: false
+train.ppo_whiten_rewards: false
+train.pref_beta: 0.1
+train.pref_ftx: 0
+train.pref_loss: sigmoid
+train.report_to: false
+train.resize_vocab: false
+train.reward_model: null
+train.save_steps: 100
+train.shift_attn: false
+train.train_on_prompt: false
+train.training_stage: Supervised Fine-Tuning
+train.use_badam: false
+train.use_dora: false
+train.use_galore: false
+train.use_llama_pro: false
+train.use_pissa: false
+train.use_rslora: false
+train.val_size: 0
+train.warmup_steps: 0

qwen.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

running_log.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [INFO|parser.py:352] 2024-08-31 17:53:54,714 >> Process rank: 0, device: mps, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16
2	+

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

tokenization_qwen.py ADDED Viewed

	@@ -0,0 +1,276 @@

+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tokenization classes for QWen."""
+import base64
+import logging
+import os
+import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union
+import tiktoken
+from transformers import PreTrainedTokenizer, AddedToken
+logger = logging.getLogger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ENDOFTEXT = "<|endoftext|>"
+IMSTART = "<|im_start|>"
+IMEND = "<|im_end|>"
+# as the default behavior is changed to allow special tokens in
+# regular texts, the surface forms of special tokens need to be
+# as different as possible to minimize the impact
+EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+# changed to use actual index to avoid misconfiguration with vocabulary expansion
+SPECIAL_START_ID = 151643
+SPECIAL_TOKENS = tuple(
+    enumerate(
+        (
+            (
+                ENDOFTEXT,
+                IMSTART,
+                IMEND,
+            )
+            + EXTRAS
+        ),
+        start=SPECIAL_START_ID,
+    )
+)
+SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+class QWenTokenizer(PreTrainedTokenizer):
+    """QWen tokenizer."""
+    vocab_files_names = VOCAB_FILES_NAMES
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        extra_vocab_file=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # how to handle errors in decoding UTF-8 byte sequences
+        # use ignore if you are in streaming inference
+        self.errors = errors
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
+        self.special_tokens = {
+            token: index
+            for index, token in SPECIAL_TOKENS
+        }
+        # try load extra vocab from file
+        if extra_vocab_file is not None:
+            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
+            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
+            for token, index in extra_mergeable_ranks.items():
+                if token in self.mergeable_ranks:
+                    logger.info(f"extra token {token} exists, skipping")
+                    continue
+                if index in used_ids:
+                    logger.info(f'the index {index} for extra token {token} exists, skipping')
+                    continue
+                self.mergeable_ranks[token] = index
+            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        assert (
+            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+        self.decoder = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }  # type: dict[int, bytes|str]
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.tokenizer = enc  # type: tiktoken.Encoding
+        self.eod_id = self.tokenizer.eot_token
+        self.im_start_id = self.special_tokens[IMSTART]
+        self.im_end_id = self.special_tokens[IMEND]
+    def __getstate__(self):
+        # for pickle lovers
+        state = self.__dict__.copy()
+        del state["tokenizer"]
+        return state
+    def __setstate__(self, state):
+        # tokenizer is not python native; don't pass it; rebuild it
+        self.__dict__.update(state)
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+    def __len__(self) -> int:
+        return self.tokenizer.n_vocab
+    def get_vocab(self) -> Dict[bytes, int]:
+        return self.mergeable_ranks
+    def convert_tokens_to_ids(
+        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> List[int]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.mergeable_ranks.get(token))
+        return ids
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Adding regular tokens is not supported")
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in SPECIAL_TOKENS_SET:
+                raise ValueError("Adding unknown special tokens is not supported")
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary).
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        file_path = os.path.join(save_directory, "qwen.tiktoken")
+        with open(file_path, "w", encoding="utf8") as w:
+            for k, v in self.mergeable_ranks.items():
+                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                w.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs,
+    ) -> List[Union[bytes, str]]:
+        """
+        Converts a string in a sequence of tokens.
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            allowed_special (`Literal["all"]` or `set`):
+                The surface forms of the tokens to be encoded as special tokens in regular texts.
+                Default to "all".
+            disallowed_special (`Literal["all"]` or `Collection`):
+                The surface forms of the tokens that should not be in regular texts and trigger errors.
+                Default to an empty tuple.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific encode method.
+        Returns:
+            `List[bytes|str]`: The list of tokens.
+        """
+        tokens = []
+        text = unicodedata.normalize("NFC", text)
+        # this implementation takes a detour: text -> token id -> token surface forms
+        for t in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[t])
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_qwen.QWenTokenizer",
+      null
+    ]
+  },
+  "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|im_end|>",
+  "model_max_length": 8192,
+  "pad_token": "<|im_end|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "QWenTokenizer"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.992,
+    "num_input_tokens_seen": 610016,
+    "total_flos": 5596354012643328.0,
+    "train_loss": 0.6199409730972782,
+    "train_runtime": 3421.6329,
+    "train_samples_per_second": 0.292,
+    "train_steps_per_second": 0.018
+}

trainer_log.jsonl ADDED Viewed

	@@ -0,0 +1,13 @@

+{"current_steps": 5, "total_steps": 62, "loss": 0.8189, "learning_rate": 4.920192797165511e-05, "epoch": 0.08, "percentage": 8.06, "elapsed_time": "0:04:59", "remaining_time": "0:56:59", "throughput": 180.63, "total_tokens": 54176}
+{"current_steps": 10, "total_steps": 62, "loss": 0.6742, "learning_rate": 4.685866540361456e-05, "epoch": 0.16, "percentage": 16.13, "elapsed_time": "0:09:21", "remaining_time": "0:48:40", "throughput": 182.53, "total_tokens": 102528}
+{"current_steps": 15, "total_steps": 62, "loss": 0.6816, "learning_rate": 4.3119819680728e-05, "epoch": 0.24, "percentage": 24.19, "elapsed_time": "0:14:05", "remaining_time": "0:44:09", "throughput": 180.81, "total_tokens": 152912}
+{"current_steps": 20, "total_steps": 62, "loss": 0.6588, "learning_rate": 3.822410025817406e-05, "epoch": 0.32, "percentage": 32.26, "elapsed_time": "0:18:45", "remaining_time": "0:39:23", "throughput": 181.6, "total_tokens": 204432}
+{"current_steps": 25, "total_steps": 62, "loss": 0.5882, "learning_rate": 3.2484078074333954e-05, "epoch": 0.4, "percentage": 40.32, "elapsed_time": "0:23:35", "remaining_time": "0:34:55", "throughput": 179.77, "total_tokens": 254512}
+{"current_steps": 30, "total_steps": 62, "loss": 0.6473, "learning_rate": 2.6266229220967818e-05, "epoch": 0.48, "percentage": 48.39, "elapsed_time": "0:27:52", "remaining_time": "0:29:44", "throughput": 178.51, "total_tokens": 298608}
+{"current_steps": 35, "total_steps": 62, "loss": 0.6343, "learning_rate": 1.9967536997783494e-05, "epoch": 0.56, "percentage": 56.45, "elapsed_time": "0:32:46", "remaining_time": "0:25:16", "throughput": 177.47, "total_tokens": 348928}
+{"current_steps": 40, "total_steps": 62, "loss": 0.5067, "learning_rate": 1.399014621105914e-05, "epoch": 0.64, "percentage": 64.52, "elapsed_time": "0:37:00", "remaining_time": "0:20:21", "throughput": 178.92, "total_tokens": 397216}
+{"current_steps": 45, "total_steps": 62, "loss": 0.5386, "learning_rate": 8.715687931944449e-06, "epoch": 0.72, "percentage": 72.58, "elapsed_time": "0:41:30", "remaining_time": "0:15:40", "throughput": 178.61, "total_tokens": 444832}
+{"current_steps": 50, "total_steps": 62, "loss": 0.563, "learning_rate": 4.480913969818098e-06, "epoch": 0.8, "percentage": 80.65, "elapsed_time": "0:46:06", "remaining_time": "0:11:03", "throughput": 177.36, "total_tokens": 490640}
+{"current_steps": 55, "total_steps": 62, "loss": 0.5484, "learning_rate": 1.5561966963229924e-06, "epoch": 0.88, "percentage": 88.71, "elapsed_time": "0:50:03", "remaining_time": "0:06:22", "throughput": 179.23, "total_tokens": 538400}
+{"current_steps": 60, "total_steps": 62, "loss": 0.5979, "learning_rate": 1.2826691520262114e-07, "epoch": 0.96, "percentage": 96.77, "elapsed_time": "0:54:46", "remaining_time": "0:01:49", "throughput": 179.48, "total_tokens": 589856}
+{"current_steps": 62, "total_steps": 62, "epoch": 0.992, "percentage": 100.0, "elapsed_time": "0:57:01", "remaining_time": "0:00:00", "throughput": 178.28, "total_tokens": 610016}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,139 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7084760069847107,
+      "learning_rate": 4.920192797165511e-05,
+      "loss": 0.8189,
+      "num_input_tokens_seen": 54176,
+      "step": 5
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7297347784042358,
+      "learning_rate": 4.685866540361456e-05,
+      "loss": 0.6742,
+      "num_input_tokens_seen": 102528,
+      "step": 10
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 684049.0625,
+      "learning_rate": 4.3119819680728e-05,
+      "loss": 0.6816,
+      "num_input_tokens_seen": 152912,
+      "step": 15
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5369439125061035,
+      "learning_rate": 3.822410025817406e-05,
+      "loss": 0.6588,
+      "num_input_tokens_seen": 204432,
+      "step": 20
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5415367484092712,
+      "learning_rate": 3.2484078074333954e-05,
+      "loss": 0.5882,
+      "num_input_tokens_seen": 254512,
+      "step": 25
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7139325737953186,
+      "learning_rate": 2.6266229220967818e-05,
+      "loss": 0.6473,
+      "num_input_tokens_seen": 298608,
+      "step": 30
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2127319.0,
+      "learning_rate": 1.9967536997783494e-05,
+      "loss": 0.6343,
+      "num_input_tokens_seen": 348928,
+      "step": 35
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 5908939.5,
+      "learning_rate": 1.399014621105914e-05,
+      "loss": 0.5067,
+      "num_input_tokens_seen": 397216,
+      "step": 40
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1202378.875,
+      "learning_rate": 8.715687931944449e-06,
+      "loss": 0.5386,
+      "num_input_tokens_seen": 444832,
+      "step": 45
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4307733178138733,
+      "learning_rate": 4.480913969818098e-06,
+      "loss": 0.563,
+      "num_input_tokens_seen": 490640,
+      "step": 50
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 11058075.0,
+      "learning_rate": 1.5561966963229924e-06,
+      "loss": 0.5484,
+      "num_input_tokens_seen": 538400,
+      "step": 55
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 2679230.5,
+      "learning_rate": 1.2826691520262114e-07,
+      "loss": 0.5979,
+      "num_input_tokens_seen": 589856,
+      "step": 60
+    },
+    {
+      "epoch": 0.992,
+      "num_input_tokens_seen": 610016,
+      "step": 62,
+      "total_flos": 5596354012643328.0,
+      "train_loss": 0.6199409730972782,
+      "train_runtime": 3421.6329,
+      "train_samples_per_second": 0.292,
+      "train_steps_per_second": 0.018
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 62,
+  "num_input_tokens_seen": 610016,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5596354012643328.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:311875b8ca25e2de38752c7cd348177c9ca8d80ec40f9c44f87bd2bc51a3e94b
+size 5368

training_args.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+bf16: true
+cutoff_len: 1024
+dataset: glaive_toolcall_en
+dataset_dir: data
+ddp_timeout: 180000000
+do_train: true
+finetuning_type: lora
+flash_attn: auto
+gradient_accumulation_steps: 8
+include_num_input_tokens_seen: true
+learning_rate: 5.0e-05
+logging_steps: 5
+lora_alpha: 16
+lora_dropout: 0
+lora_rank: 8
+lora_target: all
+lr_scheduler_type: cosine
+max_grad_norm: 1.0
+max_samples: 100000
+model_name_or_path: qwen/Qwen-1_8B-Chat
+num_train_epochs: 3.0
+optim: adamw_torch
+output_dir: saves/Qwen-1.8B-Chat/lora/train_2024-08-31-17-40-34
+packing: false
+per_device_train_batch_size: 2
+plot_loss: true
+preprocessing_num_workers: 16
+report_to: none
+save_steps: 100
+stage: sft
+template: qwen
+warmup_steps: 0

training_loss.png ADDED Viewed