Initial Upload

Browse files

Files changed (8) hide show

README.md +85 -0
config.json +56 -0
generation_config.json +5 -0
model.safetensors +3 -0
quantize_config.json +25 -0
special_tokens_map.json +41 -0
tokenizer.json +0 -0
tokenizer_config.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,85 @@

+---
+license: other
+license_name: falcon-llm-license
+license_link: https://falconllm.tii.ae/falcon-terms-and-conditions.html
+language:
+  - en
+  - fr
+  - es
+  - pt
+pipeline_tag: text-generation
+tags:
+- causal-lm
+- autoround
+- auto-round
+- intel-autoround
+- woq
+- autogptq
+- auto-gptq
+- gptq
+- intel
+- pytorch
+- falcon3
+model_name: Falcon3 3B Base
+base_model:
+  - tiiuae/Falcon3-3B-Base
+inference: false
+library_name: transformers
+model_creator: tiiuae
+prompt_template: '{prompt} '
+quantized_by: fbaldassarri
+---
+## Model Information
+Quantized version of [tiiuae/Falcon3-3B-Base](https://huggingface.co/tiiuae/Falcon3-3B-Base) using torch.float32 for quantization tuning.
+- 4 bits (INT4)
+- group size = 128
+- Symmetrical Quantization
+- Method AutoGPTQ
+Quantization framework: [Intel AutoRound](https://github.com/intel/auto-round) v0.4.4
+Note: this INT4 version of Falcon3-3B-Base has been quantized to run inference on CPU.
+## Replication Recipe
+### Step 1 Install Requirements
+I suggest installing the requirements into a dedicated Python virtualenv or a conda environment.
+```
+wget https://github.com/intel/auto-round/archive/refs/tags/v0.4.4.tar.gz
+tar -xvzf v0.4.4.tar.gz
+cd auto-round-0.4.4
+pip install -r requirements-cpu.txt --upgrade
+```
+### Step 2 Build Intel AutoRound wheel from source
+```
+pip install -vvv --no-build-isolation -e .[cpu]
+```
+### Step 3 Script for Quantization
+```
+  from transformers import AutoModelForCausalLM, AutoTokenizer
+  model_name = "tiiuae/Falcon3-3B-Base"
+  model = AutoModelForCausalLM.from_pretrained(model_name)
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  from auto_round import AutoRound
+  bits, group_size, sym, device, amp = 4, 128, True, 'cpu', False
+  autoround = AutoRound(model, tokenizer, nsamples=128, iters=200, seqlen=512, batch_size=4, bits=bits, group_size=group_size, sym=sym, device=device, amp=amp)
+  autoround.quantize()
+  output_dir = "./AutoRound/tiiuae_Falcon3-3B-Base-autogptq-int4-gs128-sym"
+  autoround.save_quantized(output_dir, format='auto_gptq', inplace=True)
+```
+## License
+[Falcon3 License](https://falconllm.tii.ae/falcon-terms-and-conditions.html)
+## Disclaimer
+This quantized model comes with no warranty. It has been developed only for research purposes.

config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "_name_or_path": "tiiuae/Falcon3-3B-Base",
+  "activation": "swiglu",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 11,
+  "head_dim": 256,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 9216,
+  "max_position_embeddings": 32768,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "amp": false,
+    "autoround_version": "0.4.3",
+    "batch_size": 4,
+    "bits": 4,
+    "damp_percent": 0.01,
+    "data_type": "int",
+    "desc_act": false,
+    "enable_minmax_tuning": true,
+    "enable_norm_bias_tuning": false,
+    "enable_quanted_input": true,
+    "gradient_accumulate_steps": 1,
+    "group_size": 128,
+    "iters": 200,
+    "low_gpu_mem_usage": false,
+    "lr": 0.005,
+    "minmax_lr": 0.005,
+    "nsamples": 128,
+    "quant_method": "gptq",
+    "scale_dtype": "torch.float16",
+    "seqlen": 512,
+    "sym": true,
+    "to_quant_block_names": null,
+    "true_sequential": false
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000042,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0",
+  "use_cache": true,
+  "vocab_size": 131072
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": 11,
+  "transformers_version": "4.48.0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3586c67e5e29d510b2ddc6ebf783924961cab1ad2b7174a880d3e46e48ec5436
+size 4482700120

quantize_config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "bits": 4,
+  "group_size": 128,
+  "sym": true,
+  "data_type": "int",
+  "enable_quanted_input": true,
+  "enable_minmax_tuning": true,
+  "seqlen": 512,
+  "batch_size": 4,
+  "scale_dtype": "torch.float16",
+  "lr": 0.005,
+  "minmax_lr": 0.005,
+  "gradient_accumulate_steps": 1,
+  "iters": 200,
+  "amp": false,
+  "nsamples": 128,
+  "low_gpu_mem_usage": false,
+  "to_quant_block_names": null,
+  "enable_norm_bias_tuning": false,
+  "autoround_version": "0.4.3",
+  "quant_method": "gptq",
+  "desc_act": false,
+  "true_sequential": false,
+  "damp_percent": 0.01
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "additional_special_tokens": [
+    ">>TITLE<<",
+    ">>ABSTRACT<<",
+    ">>INTRODUCTION<<",
+    ">>SUMMARY<<",
+    ">>COMMENT<<",
+    ">>ANSWER<<",
+    ">>QUESTION<<",
+    ">>DOMAIN<<",
+    ">>EMAIL_ADDRESS<<",
+    ">>IP_ADDRESS<<",
+    "<|startoftext|>",
+    ">>IP_ADDRESS_0<<",
+    ">>IP_ADDRESS_1<<",
+    ">>IP_ADDRESS_2<<",
+    ">>IP_ADDRESS_3<<",
+    ">>IP_ADDRESS_4<<",
+    ">>IP_ADDRESS_5<<",
+    ">>IP_ADDRESS_6<<",
+    ">>IP_ADDRESS_7<<",
+    ">>IP_ADDRESS_8<<",
+    ">>IP_ADDRESS_9<<",
+    ">>PASSWORD<<",
+    ">>KEY<<"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff