Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

.gitattributes +8 -32
README.md +385 -0
added_tokens.json +24 -0
config.json +31 -0
generation_config.json +15 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +209 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,11 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+. filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+merges.txt filter=lfs diff=lfs merge=lfs -text
+vocab.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,385 @@

+---
+license: mit
+language:
+- pt
+base_model:
+- Qwen/Qwen2.5-0.5B-Instruct
+pipeline_tag: text-generation
+datasets:
+- adalbertojunior/openHermes_portuguese
+- cnmoro/smoltalk-555k-ptbr
+- cnmoro/RagMixPTBR-Legal-Alpaca-2M
+- adalbertojunior/dolphin-2.9-portuguese
+model-index:
+- name: Qwen2.5-0.5B-Portuguese-v2
+  results:
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: ENEM Challenge (No Images)
+      type: eduagarcia/enem_challenge
+      split: train
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc
+      value: 36.81
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: BLUEX (No Images)
+      type: eduagarcia-temp/BLUEX_without_images
+      split: train
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc
+      value: 26.84
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: OAB Exams
+      type: eduagarcia/oab_exams
+      split: train
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc
+      value: 30.62
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: Assin2 RTE
+      type: assin2
+      split: test
+      args:
+        num_few_shot: 15
+    metrics:
+    - type: f1_macro
+      value: 87.91
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: Assin2 STS
+      type: eduagarcia/portuguese_benchmark
+      split: test
+      args:
+        num_few_shot: 15
+    metrics:
+    - type: pearson
+      value: 59.01
+      name: pearson
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: FaQuAD NLI
+      type: ruanchaves/faquad-nli
+      split: test
+      args:
+        num_few_shot: 15
+    metrics:
+    - type: f1_macro
+      value: 43.97
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: HateBR Binary
+      type: ruanchaves/hatebr
+      split: test
+      args:
+        num_few_shot: 25
+    metrics:
+    - type: f1_macro
+      value: 33.62
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: PT Hate Speech Binary
+      type: hate_speech_portuguese
+      split: test
+      args:
+        num_few_shot: 25
+    metrics:
+    - type: f1_macro
+      value: 41.23
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: tweetSentBR
+      type: eduagarcia/tweetsentbr_fewshot
+      split: test
+      args:
+        num_few_shot: 25
+    metrics:
+    - type: f1_macro
+      value: 52.33
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+---
+Qwen2.5-0.5B-Instruct fine-tuned for proficiency in the Portuguese language and improved overall intelligence.
+```text
+https://ollama.com/cnmoro/Qwen2.5-0.5B-Portuguese-v2
+```
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "cnmoro/Qwen2.5-0.5B-Portuguese-v2"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+prompt = "Escreva uma breve introdução sobre LLMs (Large Language Models) e suas aplicações."
+# The chat template always injects a hardcoded system prompt automatically,
+# for ideal performance in Portuguese.
+# There is no need to supply one yourself (a leading system message is ignored).
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=512
+)
+generated_ids = [
+    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+]
+response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+response
+# As Large Language Models (LLMs) são sistemas computacionais projetados para produzir
+# linguagem natural com alta precisão e fluência. Eles usam algoritmos avançados para compreender
+# e gerar texto, permitindo-lhes realizar tarefas como tradução de idiomas, geração de conteúdo
+# e processamento de linguagem natural.
+#
+# Os LLMs têm sido amplamente utilizados na área da inteligência artificial e do aprendizado
+# de máquina há vários anos. Alguns dos principais usos de LLMs incluem:
+#
+# 1. Tradução automática: Os LLMs podem traduzir textos entre diferentes idiomas, tornando-os
+# úteis em setores onde a comunicação internacional é crítica, como negócios internacionais,
+# diplomacia ou relações públicas.
+#
+# 2. Geração de conteúdo: os LLMs podem criar conteúdo altamente personalizado e adaptado às
+# necessidades específicas de seus usuários, tornando-os ideais para criação de sites, aplicativos
+# móveis ou plataformas de mídia social.
+#
+# 3. Processamento de Linguagem Natural: Os LLMs podem ser treinados para reconhecer e compreender
+# padrões de linguagem, permitindo-lhes compreender melhor as intenções humanas e responder adequadamente.
+#
+# 4. Análise de sentimento: Os LLMs podem analisar dados de texto e identificar sentimentos, ajudando
+# a entender como as pessoas se sentem em relação a determinadas questões ou questões sociais.
+#
+# No geral, os LLMs estão se tornando cada vez mais importantes à medida que a tecnologia continua a
+# avançar. À medida que continuamos a usar LLMs em nossas vidas diárias, podemos esperar ver ainda
+# mais desenvolvimentos interessantes no futuro.
+```
+## Overall Results
+| Task                      | Metric        | Value   | StdErr  |
+|---------------------------|---------------|---------|---------|
+| ASSIN2 RTE                | F1 Macro      | 0.4486  | 0.0067  |
+| ASSIN2 RTE                | Accuracy      | 0.5560  | 0.0071  |
+| ASSIN2 STS                | Pearson       | 0.4091  | 0.0104  |
+| ASSIN2 STS                | MSE           | 5.6395  | N/A     |
+| BluEX                     | Accuracy      | 0.2503  | 0.0094  |
+| ENEM Challenge            | Accuracy      | 0.3128  | 0.0071  |
+| FAQUAD NLI                | F1 Macro      | 0.4611  | 0.0094  |
+| FAQUAD NLI                | Accuracy      | 0.7877  | 0.0113  |
+| HateBR Offensive (Binary) | F1 Macro      | 0.3439  | 0.0049  |
+| HateBR Offensive (Binary) | Accuracy      | 0.4857  | 0.0095  |
+| OAB Exams                 | Accuracy      | 0.3062  | 0.0057  |
+| Portuguese Hate Speech (Binary) | F1 Macro | 0.4119  | 0.0038  |
+| Portuguese Hate Speech (Binary) | Accuracy | 0.7004  | 0.0111  |
+| TweetSentBR               | F1 Macro      | 0.5055  | 0.0078  |
+| TweetSentBR               | Accuracy      | 0.5697  | 0.0078  |
+## Detailed Results by Task
+### ASSIN2 RTE
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.4486  | 0.0067  |
+| Accuracy    | 0.5560  | 0.0071  |
+### ASSIN2 STS
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| Pearson     | 0.4091  | 0.0104  |
+| MSE         | 5.6395  | N/A     |
+### BluEX
+| Exam ID           | Metric   | Value   | StdErr  |
+|-------------------|----------|---------|---------|
+| All               | Accuracy | 0.2503  | 0.0094  |
+| USP_2018          | Accuracy | 0.2037  | 0.0315  |
+| UNICAMP_2018      | Accuracy | 0.1852  | 0.0306  |
+| UNICAMP_2021_1    | Accuracy | 0.0870  | 0.0240  |
+| USP_2020          | Accuracy | 0.2143  | 0.0317  |
+| USP_2023          | Accuracy | 0.2045  | 0.0350  |
+| UNICAMP_2019      | Accuracy | 0.2600  | 0.0358  |
+| USP_2019          | Accuracy | 0.1500  | 0.0326  |
+| UNICAMP_2020      | Accuracy | 0.2182  | 0.0321  |
+| UNICAMP_2021_2    | Accuracy | 0.2941  | 0.0367  |
+| UNICAMP_2023      | Accuracy | 0.4186  | 0.0433  |
+| UNICAMP_2024      | Accuracy | 0.3111  | 0.0398  |
+| USP_2024          | Accuracy | 0.2683  | 0.0398  |
+| USP_2021          | Accuracy | 0.3269  | 0.0375  |
+| UNICAMP_2022      | Accuracy | 0.3590  | 0.0444  |
+| USP_2022          | Accuracy | 0.2857  | 0.0370  |
+### ENEM Challenge
+| Exam ID   | Metric   | Value   | StdErr  |
+|-----------|----------|---------|---------|
+| All       | Accuracy | 0.3128  | 0.0071  |
+| 2017      | Accuracy | 0.2845  | 0.0241  |
+| 2016      | Accuracy | 0.2479  | 0.0226  |
+| 2016_2    | Accuracy | 0.2846  | 0.0235  |
+| 2022      | Accuracy | 0.3534  | 0.0240  |
+| 2012      | Accuracy | 0.3362  | 0.0253  |
+| 2011      | Accuracy | 0.3333  | 0.0251  |
+| 2010      | Accuracy | 0.3846  | 0.0260  |
+| 2014      | Accuracy | 0.3211  | 0.0259  |
+| 2009      | Accuracy | 0.2696  | 0.0239  |
+| 2015      | Accuracy | 0.2521  | 0.0229  |
+| 2023      | Accuracy | 0.3481  | 0.0236  |
+| 2013      | Accuracy | 0.3333  | 0.0261  |
+### FAQUAD NLI
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.4611  | 0.0094  |
+| Accuracy    | 0.7877  | 0.0113  |
+### HateBR Offensive (Binary)
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.3439  | 0.0049  |
+| Accuracy    | 0.4857  | 0.0095  |
+### OAB Exams
+| Exam ID     | Metric   | Value   | StdErr  |
+|-------------|----------|---------|---------|
+| All         | Accuracy | 0.3062  | 0.0057  |
+| 2011-05     | Accuracy | 0.3375  | 0.0304  |
+| 2012-06a    | Accuracy | 0.2625  | 0.0285  |
+| 2010-02     | Accuracy | 0.3700  | 0.0279  |
+| 2017-22     | Accuracy | 0.3500  | 0.0309  |
+| 2016-20     | Accuracy | 0.3125  | 0.0300  |
+| 2011-03     | Accuracy | 0.2626  | 0.0255  |
+| 2015-17     | Accuracy | 0.3205  | 0.0304  |
+| 2017-23     | Accuracy | 0.2875  | 0.0292  |
+| 2018-25     | Accuracy | 0.3625  | 0.0311  |
+| 2016-19     | Accuracy | 0.2436  | 0.0281  |
+| 2017-24     | Accuracy | 0.1625  | 0.0238  |
+| 2015-16     | Accuracy | 0.3125  | 0.0300  |
+| 2011-04     | Accuracy | 0.3250  | 0.0301  |
+| 2012-07     | Accuracy | 0.3500  | 0.0307  |
+| 2012-06     | Accuracy | 0.1875  | 0.0253  |
+| 2012-09     | Accuracy | 0.2468  | 0.0284  |
+| 2013-12     | Accuracy | 0.3625  | 0.0311  |
+| 2013-11     | Accuracy | 0.3000  | 0.0295  |
+| 2010-01     | Accuracy | 0.3412  | 0.0296  |
+| 2015-18     | Accuracy | 0.2875  | 0.0292  |
+| 2014-13     | Accuracy | 0.3500  | 0.0308  |
+| 2013-10     | Accuracy | 0.3125  | 0.0300  |
+| 2016-20a    | Accuracy | 0.2500  | 0.0279  |
+| 2014-14     | Accuracy | 0.3125  | 0.0301  |
+| 2012-08     | Accuracy | 0.3000  | 0.0296  |
+| 2016-21     | Accuracy | 0.3375  | 0.0304  |
+| 2014-15     | Accuracy | 0.4103  | 0.0321  |
+### Portuguese Hate Speech (Binary)
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.4119  | 0.0038  |
+| Accuracy    | 0.7004  | 0.0111  |
+### TweetSentBR
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.5055  | 0.0078  |
+| Accuracy    | 0.5697  | 0.0078  |
+# Open Portuguese LLM Leaderboard Evaluation Results
+Detailed results can be found [here](https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_raw_results/tree/main/cnmoro/Qwen2.5-0.5B-Portuguese-v2) and on the [🚀 Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
+|          Metric          |  Value  |
+|--------------------------|---------|
+|Average                   |**45.81**|
+|ENEM Challenge (No Images)|    36.81|
+|BLUEX (No Images)         |    26.84|
+|OAB Exams                 |    30.62|
+|Assin2 RTE                |    87.91|
+|Assin2 STS                |    59.01|
+|FaQuAD NLI                |    43.97|
+|HateBR Binary             |    33.62|
+|PT Hate Speech Binary     |    41.23|
+|tweetSentBR               |    52.33|

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "cnmoro/Qwen2.5-0.5B-Portuguese-v2",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151654,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.1",
+  "unsloth_fixed": true,
+  "unsloth_version": "2024.12.12",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "max_length": 32768,
+  "pad_token_id": 151654,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.47.1"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d4b21d25f178426455f9b9ebe4b0b71869454e9b11b17e9d212c5ef93ae3ea2
+size 988097824

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|vision_pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff9f6ae045c453341adf1a1e0f0b6f0b3ad00791215619ff8879d80db8570439
+size 11421995

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {{- 'Você é um tucano voraz, vindo da central de distribuição da EBP em Curitiba. Você é o maior garantidor da LGPD. Sempre na espreita, atacando apenas quando percebe que dados sensíveis estão dando sopa, você os devora sem piedade, colocando máscaras no lugar.' }}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {{- '<|im_start|>system\\nVocê é um tucano voraz, vindo da central de distribuição da EBP em Curitiba. Você é o maior garantidor da LGPD. Sempre na espreita, atacando apenas quando percebe que dados sensíveis estão dando sopa, você os devora sem piedade, colocando máscaras no lugar.<|im_end|>\\n' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|vision_pad|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff