agoor97 committed
Commit 16ffc97 · verified · 1 Parent(s): 4c59cd9

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +8 -34
  2. .gitignore +175 -0
  3. CMD.md +91 -0
  4. README.md +60 -0
  5. hf_upload.ipynb +358 -0
  6. old_scripts/convert_for_unity.py +1024 -0
  7. old_scripts/convert_single_model.py +492 -0
  8. old_scripts/convert_to_onnx.py +261 -0
  9. old_scripts/test_chat.py +402 -0
  10. onnx_models/bloom_onnx/config.json +32 -0
  11. onnx_models/bloom_onnx/generation_config.json +7 -0
  12. onnx_models/bloom_onnx/model.onnx +3 -0
  13. onnx_models/bloom_onnx/special_tokens_map.json +30 -0
  14. onnx_models/bloom_onnx/tokenizer.json +3 -0
  15. onnx_models/bloom_onnx/tokenizer_config.json +48 -0
  16. onnx_models/bloom_onnx_quantized/config.json +32 -0
  17. onnx_models/bloom_onnx_quantized/model_quantized.onnx +3 -0
  18. onnx_models/bloom_onnx_quantized/ort_config.json +33 -0
  19. onnx_models/bloom_onnx_quantized/special_tokens_map.json +30 -0
  20. onnx_models/bloom_onnx_quantized/tokenizer.json +3 -0
  21. onnx_models/bloom_onnx_quantized/tokenizer_config.json +48 -0
  22. onnx_models/falcon_onnx/config.json +41 -0
  23. onnx_models/falcon_onnx/generation_config.json +6 -0
  24. onnx_models/falcon_onnx/merges.txt +0 -0
  25. onnx_models/falcon_onnx/model.onnx +3 -0
  26. onnx_models/falcon_onnx/special_tokens_map.json +23 -0
  27. onnx_models/falcon_onnx/tokenizer.json +0 -0
  28. onnx_models/falcon_onnx/tokenizer_config.json +20 -0
  29. onnx_models/falcon_onnx/vocab.json +0 -0
  30. onnx_models/gpt2_onnx/config.json +41 -0
  31. onnx_models/gpt2_onnx/generation_config.json +6 -0
  32. onnx_models/gpt2_onnx/merges.txt +0 -0
  33. onnx_models/gpt2_onnx/model.onnx +3 -0
  34. onnx_models/gpt2_onnx/special_tokens_map.json +5 -0
  35. onnx_models/gpt2_onnx/tokenizer.json +0 -0
  36. onnx_models/gpt2_onnx/tokenizer_config.json +20 -0
  37. onnx_models/gpt2_onnx/vocab.json +0 -0
  38. onnx_models/gpt2_onnx_quantized/config.json +41 -0
  39. onnx_models/gpt2_onnx_quantized/merges.txt +0 -0
  40. onnx_models/gpt2_onnx_quantized/model_quantized.onnx +3 -0
  41. onnx_models/gpt2_onnx_quantized/ort_config.json +33 -0
  42. onnx_models/gpt2_onnx_quantized/special_tokens_map.json +23 -0
  43. onnx_models/gpt2_onnx_quantized/tokenizer.json +0 -0
  44. onnx_models/gpt2_onnx_quantized/tokenizer_config.json +20 -0
  45. onnx_models/gpt2_onnx_quantized/vocab.json +0 -0
  46. onnx_models/opt_onnx/config.json +31 -0
  47. onnx_models/opt_onnx/generation_config.json +7 -0
  48. onnx_models/opt_onnx/merges.txt +0 -0
  49. onnx_models/opt_onnx/model.onnx +3 -0
  50. onnx_models/opt_onnx/special_tokens_map.json +30 -0
.gitattributes CHANGED
@@ -1,35 +1,9 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.onnx_data filter=lfs diff=lfs merge=lfs -text
+
+ onnx_models/bloom_onnx/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ onnx_models/bloom_onnx_quantized/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ onnx_models/qwen_onnx/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ onnx_models/qwen_onnx_quantized/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ onnx_models/tinyllama_onnx/tokenizer.model filter=lfs diff=lfs merge=lfs -text
+ onnx_models/tinyllama_onnx_quantized/tokenizer.model filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,175 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+ *.onnx_data
CMD.md ADDED
@@ -0,0 +1,91 @@
+ ### Qwen-0.5B Model
+ ``` bash
+ # Step 1: Export for text generation with past KV cache (better for chat)
+ echo "Exporting Qwen-0.5B..."
+ optimum-cli export onnx --model Qwen/Qwen1.5-0.5B --task text-generation-with-past onnx_models/qwen_onnx/
+
+ # Step 2: Quantize for ARM64 (mobile target) using static INT8 quantization
+ echo "Quantizing Qwen-0.5B for ARM64 (Static)..."
+ optimum-cli onnxruntime quantize --onnx_model onnx_models/qwen_onnx/ --arm64 -o onnx_models/qwen_onnx_quantized/
+ ```
+
+ -----------------------------------
+
+ ### TinyLlama-1.1B Model
+ ``` bash
+ # Step 1: Export for text generation with past KV cache (better for chat)
+ echo "Exporting TinyLlama-1.1B..."
+ optimum-cli export onnx --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --task text-generation-with-past onnx_models/tinyllama_onnx/
+
+ # Step 2: Attempt quantization for ARM64 (static INT8)
+ # Note: this step can take very long or fail on low-memory machines; keep the non-quantized export as a fallback.
+ echo "Attempting TinyLlama-1.1B quantization for ARM64 (Static)..."
+ optimum-cli onnxruntime quantize --onnx_model onnx_models/tinyllama_onnx/ --arm64 -o onnx_models/tinyllama_onnx_quantized/
+ ```
+
+ -----------------------------------
+
+ ### Phi-1.5 Model
+ ``` bash
+ # Step 1: Export for text generation with past KV cache (better for chat)
+ echo "Exporting Phi-1.5..."
+ optimum-cli export onnx --model microsoft/phi-1_5 --task text-generation-with-past onnx_models/phi_onnx/
+
+ # Step 2: Attempt quantization for ARM64 (static INT8) -- failed locally (requires a lot of memory)
+ echo "Quantizing Phi-1.5 for ARM64 (Static)..."
+ optimum-cli onnxruntime quantize --onnx_model onnx_models/phi_onnx/ --arm64 -o onnx_models/phi_onnx_quantized/
+ ```
+
+ -----------------------------------
+
+ ### Falcon-1B Model
+ ``` bash
+ # Export
+ echo "Exporting Falcon-1B..."
+ optimum-cli export onnx --model tiiuae/falcon-rw-1b --task text-generation-with-past onnx_models/falcon_onnx/
+
+ # Quantize for ARM64 -- failed locally (requires a lot of memory)
+ echo "Quantizing Falcon-1B for ARM64..."
+ optimum-cli onnxruntime quantize --onnx_model onnx_models/falcon_onnx/ --arm64 -o onnx_models/falcon_onnx_quantized/
+ ```
+
+ -----------------------------------
+
+ ### GPT2-Medium Model
+ ``` bash
+ # Export GPT2-Medium
+ echo "Exporting GPT2-Medium..."
+ optimum-cli export onnx --model gpt2-medium --task text-generation-with-past onnx_models/gpt2_onnx/
+
+ # Quantize for ARM64
+ echo "Quantizing GPT2-Medium for ARM64..."
+ optimum-cli onnxruntime quantize --onnx_model onnx_models/gpt2_onnx/ --arm64 -o onnx_models/gpt2_onnx_quantized/
+ ```
+
+ -----------------------------------
+
+ ### OPT-350M Model
+ ``` bash
+ # Export OPT-350M
+ echo "Exporting OPT-350M..."
+ optimum-cli export onnx --model facebook/opt-350m --task text-generation-with-past onnx_models/opt_onnx/
+
+ # Quantize for ARM64
+ echo "Quantizing OPT-350M for ARM64..."
+ optimum-cli onnxruntime quantize --onnx_model onnx_models/opt_onnx/ --arm64 -o onnx_models/opt_onnx_quantized/
+ ```
+
+ -----------------------------------
+
+ ### Bloom-560M Model
+ ``` bash
+ # Export Bloom-560M
+ echo "Exporting Bloom-560M..."
+ optimum-cli export onnx --model bigscience/bloom-560m --task text-generation-with-past onnx_models/bloom_onnx/
+
+ # Quantize for ARM64
+ echo "Quantizing Bloom-560M for ARM64..."
+ optimum-cli onnxruntime quantize --onnx_model onnx_models/bloom_onnx/ --arm64 -o onnx_models/bloom_onnx_quantized/
+ ```
+
+ -----------------------------------
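A quick way to sanity-check any of the exported folders above is to load them back through Optimum's ONNX Runtime wrapper and generate a few tokens. This is a minimal sketch, not part of the committed scripts; it assumes the `onnx_models/gpt2_onnx/` directory produced by the GPT2-Medium commands, and depending on your `optimum` version you may need to point `file_name` at the exported `.onnx` file explicitly.

```python
# Hedged sanity check: reload an exported model and generate a short reply.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_dir = "onnx_models/gpt2_onnx"  # any export directory from the commands above

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = ORTModelForCausalLM.from_pretrained(model_dir)  # add file_name="model.onnx" if your optimum version needs it

prompt = "User: Hello, how are you?\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

If this prints a plausible continuation, the export itself is sound and any quality issues are more likely to come from quantization than from the conversion step.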
README.md ADDED
@@ -0,0 +1,60 @@
+ # 🚀 LLM to ONNX Converter
+ > Convert small language models to ONNX format with **guaranteed reliability** for RAG and chatbot applications on resource-constrained hardware.
+
+ ## 📋 Overview
+ This repository provides scripts to convert small language models to ONNX format and create INT8 quantized versions for efficient deployment on resource-constrained devices. Perfect for mobile applications, Unity game engines, and embedded systems.
+
+ ## ✅ Tested Models
+ We've successfully tested the following models with example outputs:
+
+ | Model | Size | Quantized | Response Quality | Speed (sec) |
+ |-------|------|-----------|------------------|-------------|
+ | Qwen-0.5B | 500M | ✅ | ❌ Poor | 8.37 |
+ | Qwen-0.5B | 500M | ❌ | ✅ Good | 15.69 |
+ | TinyLlama-1.1B | 1.1B | ✅ | ❌ Poor | 10.15 |
+ | TinyLlama-1.1B | 1.1B | ❌ | ✅ Good | 19.23 |
+ | Phi-1.5 | 1.3B | ❌ | ✅ Good | 15.32 |
+ | Falcon-RW-1B | 1B | ❌ | ✅ Good | 21.56 |
+ | GPT2-Medium | 355M | ✅ | ✅ Good | 6.27 |
+ | GPT2-Medium | 355M | ❌ | ✅ Good | 12.77 |
+ | OPT-350M | 350M | ✅ | ✅ Good | 4.33 |
+ | OPT-350M | 350M | ❌ | ✅ Good | 10.42 |
+ | Bloom-560M | 560M | ✅ | ❌ Poor | 11.93 |
+ | Bloom-560M | 560M | ❌ | ✅ Good | 34.38 |
+
+ ## 🌟 Recommendations
+ Based on our testing:
+ 1. **For best speed + quality:** OPT-350M (quantized) - fastest with good quality
+ 2. **For best overall quality:** Phi-1.5 (non-quantized) - excellent responses
+ 3. **For smallest size:** GPT2-Medium or OPT-350M (quantized) - small with good performance
+
+ ## 🚩 Key Findings
+ - Quantization provides ~2x speed improvement
+ - Smaller models (350-500M) quantize better than larger models (1B+)
+ - Some architectures (OPT, GPT2) handle quantization better than others
+
+ ## 📁 Repository Structure
+ ```
+ onnx_models/
+ ├── bloom_onnx/
+ ├── bloom_onnx_quantized/
+ ├── falcon_onnx/
+ ├── gpt2_onnx/
+ ├── gpt2_onnx_quantized/
+ ├── opt_onnx/
+ ├── opt_onnx_quantized/
+ ├── phi_onnx/
+ ├── qwen_onnx/
+ ├── qwen_onnx_quantized/
+ ├── tinyllama_onnx/
+ └── tinyllama_onnx_quantized/
+ ```
+
+ ## 📚 Requirements
+ - Python 3.8+
+ - optimum
+ - onnxruntime
+ - transformers
+ - numpy
+
+ ---------------
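As a rough way to reproduce the speed column above, the quantized and non-quantized exports can be timed side by side. This is a hedged sketch under assumptions: it uses Optimum's `ORTModelForCausalLM`, assumes the folder layout shown in the repository structure, and the absolute numbers will differ from the table depending on hardware and generation settings.

```python
# Hypothetical timing comparison: quantized vs. non-quantized OPT-350M export.
import time
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

def time_generation(model_dir: str, file_name: str) -> float:
    """Load an exported model and time a 50-token greedy generation."""
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = ORTModelForCausalLM.from_pretrained(model_dir, file_name=file_name)
    inputs = tokenizer("User: What is RAG?\nAssistant:", return_tensors="pt")
    start = time.perf_counter()
    model.generate(**inputs, max_new_tokens=50, do_sample=False)
    return time.perf_counter() - start

for folder, fname in [("onnx_models/opt_onnx", "model.onnx"),
                      ("onnx_models/opt_onnx_quantized", "model_quantized.onnx")]:
    print(f"{folder}: {time_generation(folder, fname):.2f} s")
```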
hf_upload.ipynb ADDED
@@ -0,0 +1,358 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "22fbff0c",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "application/vnd.jupyter.widget-view+json": {
12
+ "model_id": "a10a920b0f9749058ee8dd5ce613705a",
13
+ "version_major": 2,
14
+ "version_minor": 0
15
+ },
16
+ "text/plain": [
17
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
18
+ ]
19
+ },
20
+ "metadata": {},
21
+ "output_type": "display_data"
22
+ }
23
+ ],
24
+ "source": [
25
+ "from huggingface_hub import login\n",
26
+ "login()"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "25711ffa",
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stderr",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "/home/administrator/miniconda/lib/python3.12/site-packages/huggingface_hub/hf_api.py:9561: UserWarning: Warnings while validating metadata in README.md:\n",
40
+ "- empty or missing yaml metadata in repo card\n",
41
+ " warnings.warn(f\"Warnings while validating metadata in README.md:\\n{message}\")\n"
42
+ ]
43
+ },
44
+ {
45
+ "data": {
46
+ "application/vnd.jupyter.widget-view+json": {
47
+ "model_id": "09c1ecf794c84518803cca555425306a",
48
+ "version_major": 2,
49
+ "version_minor": 0
50
+ },
51
+ "text/plain": [
52
+ "model.onnx: 0%| | 0.00/798k [00:00<?, ?B/s]"
53
+ ]
54
+ },
55
+ "metadata": {},
56
+ "output_type": "display_data"
57
+ },
58
+ {
59
+ "data": {
60
+ "application/vnd.jupyter.widget-view+json": {
61
+ "model_id": "32f7fccea8414cac920db0d57af0e7d5",
62
+ "version_major": 2,
63
+ "version_minor": 0
64
+ },
65
+ "text/plain": [
66
+ "tokenizer.json: 0%| | 0.00/21.8M [00:00<?, ?B/s]"
67
+ ]
68
+ },
69
+ "metadata": {},
70
+ "output_type": "display_data"
71
+ },
72
+ {
73
+ "data": {
74
+ "application/vnd.jupyter.widget-view+json": {
75
+ "model_id": "ca28ba13ab1040d7bdf3c44430b5a266",
76
+ "version_major": 2,
77
+ "version_minor": 0
78
+ },
79
+ "text/plain": [
80
+ "model.onnx: 0%| | 0.00/655k [00:00<?, ?B/s]"
81
+ ]
82
+ },
83
+ "metadata": {},
84
+ "output_type": "display_data"
85
+ },
86
+ {
87
+ "data": {
88
+ "application/vnd.jupyter.widget-view+json": {
89
+ "model_id": "dc762c21c9a64897b160c8bc0a745942",
90
+ "version_major": 2,
91
+ "version_minor": 0
92
+ },
93
+ "text/plain": [
94
+ "Upload 18 LFS files: 0%| | 0/18 [00:00<?, ?it/s]"
95
+ ]
96
+ },
97
+ "metadata": {},
98
+ "output_type": "display_data"
99
+ },
100
+ {
101
+ "data": {
102
+ "application/vnd.jupyter.widget-view+json": {
103
+ "model_id": "2c80e14a6ec54618bda83086bd6ab6d3",
104
+ "version_major": 2,
105
+ "version_minor": 0
106
+ },
107
+ "text/plain": [
108
+ "model_quantized.onnx: 0%| | 0.00/561M [00:00<?, ?B/s]"
109
+ ]
110
+ },
111
+ "metadata": {},
112
+ "output_type": "display_data"
113
+ },
114
+ {
115
+ "data": {
116
+ "application/vnd.jupyter.widget-view+json": {
117
+ "model_id": "94a1fd0e86f74aa99cdce9b12d4b9bc3",
118
+ "version_major": 2,
119
+ "version_minor": 0
120
+ },
121
+ "text/plain": [
122
+ "tokenizer.json: 0%| | 0.00/21.8M [00:00<?, ?B/s]"
123
+ ]
124
+ },
125
+ "metadata": {},
126
+ "output_type": "display_data"
127
+ },
128
+ {
129
+ "data": {
130
+ "application/vnd.jupyter.widget-view+json": {
131
+ "model_id": "f4b966da10894d32815492fae6e891d1",
132
+ "version_major": 2,
133
+ "version_minor": 0
134
+ },
135
+ "text/plain": [
136
+ "model.onnx: 0%| | 0.00/1.42G [00:00<?, ?B/s]"
137
+ ]
138
+ },
139
+ "metadata": {},
140
+ "output_type": "display_data"
141
+ },
142
+ {
143
+ "data": {
144
+ "application/vnd.jupyter.widget-view+json": {
145
+ "model_id": "28bf8c5b822c40639b24d588e968dadd",
146
+ "version_major": 2,
147
+ "version_minor": 0
148
+ },
149
+ "text/plain": [
150
+ "model_quantized.onnx: 0%| | 0.00/357M [00:00<?, ?B/s]"
151
+ ]
152
+ },
153
+ "metadata": {},
154
+ "output_type": "display_data"
155
+ },
156
+ {
157
+ "data": {
158
+ "application/vnd.jupyter.widget-view+json": {
159
+ "model_id": "483803bf88fc4e41ae696925d39ae6e2",
160
+ "version_major": 2,
161
+ "version_minor": 0
162
+ },
163
+ "text/plain": [
164
+ "model.onnx: 0%| | 0.00/1.33G [00:00<?, ?B/s]"
165
+ ]
166
+ },
167
+ "metadata": {},
168
+ "output_type": "display_data"
169
+ },
170
+ {
171
+ "data": {
172
+ "application/vnd.jupyter.widget-view+json": {
173
+ "model_id": "55056aef07564537b9d8f6b6e2d9d87c",
174
+ "version_major": 2,
175
+ "version_minor": 0
176
+ },
177
+ "text/plain": [
178
+ "model_quantized.onnx: 0%| | 0.00/333M [00:00<?, ?B/s]"
179
+ ]
180
+ },
181
+ "metadata": {},
182
+ "output_type": "display_data"
183
+ },
184
+ {
185
+ "data": {
186
+ "application/vnd.jupyter.widget-view+json": {
187
+ "model_id": "0442e7fbdea74b89950176cef30930dc",
188
+ "version_major": 2,
189
+ "version_minor": 0
190
+ },
191
+ "text/plain": [
192
+ "model.onnx: 0%| | 0.00/814k [00:00<?, ?B/s]"
193
+ ]
194
+ },
195
+ "metadata": {},
196
+ "output_type": "display_data"
197
+ },
198
+ {
199
+ "data": {
200
+ "application/vnd.jupyter.widget-view+json": {
201
+ "model_id": "f46c2c40bc2c4686920bd0deb3259df6",
202
+ "version_major": 2,
203
+ "version_minor": 0
204
+ },
205
+ "text/plain": [
206
+ "model.onnx: 0%| | 0.00/1.86G [00:00<?, ?B/s]"
207
+ ]
208
+ },
209
+ "metadata": {},
210
+ "output_type": "display_data"
211
+ },
212
+ {
213
+ "data": {
214
+ "application/vnd.jupyter.widget-view+json": {
215
+ "model_id": "bb2d49b750ef4399ae2defea1fb1593d",
216
+ "version_major": 2,
217
+ "version_minor": 0
218
+ },
219
+ "text/plain": [
220
+ "tokenizer.json: 0%| | 0.00/11.4M [00:00<?, ?B/s]"
221
+ ]
222
+ },
223
+ "metadata": {},
224
+ "output_type": "display_data"
225
+ },
226
+ {
227
+ "data": {
228
+ "application/vnd.jupyter.widget-view+json": {
229
+ "model_id": "418c0351956b4a75854237f5ec3077c1",
230
+ "version_major": 2,
231
+ "version_minor": 0
232
+ },
233
+ "text/plain": [
234
+ "model_quantized.onnx: 0%| | 0.00/466M [00:00<?, ?B/s]"
235
+ ]
236
+ },
237
+ "metadata": {},
238
+ "output_type": "display_data"
239
+ },
240
+ {
241
+ "data": {
242
+ "application/vnd.jupyter.widget-view+json": {
243
+ "model_id": "bd17923e98b741e6bc159b25d2a25717",
244
+ "version_major": 2,
245
+ "version_minor": 0
246
+ },
247
+ "text/plain": [
248
+ "tokenizer.json: 0%| | 0.00/11.4M [00:00<?, ?B/s]"
249
+ ]
250
+ },
251
+ "metadata": {},
252
+ "output_type": "display_data"
253
+ },
254
+ {
255
+ "data": {
256
+ "application/vnd.jupyter.widget-view+json": {
257
+ "model_id": "93080e4239d84969a69f2c2f86b658b3",
258
+ "version_major": 2,
259
+ "version_minor": 0
260
+ },
261
+ "text/plain": [
262
+ "model.onnx: 0%| | 0.00/987k [00:00<?, ?B/s]"
263
+ ]
264
+ },
265
+ "metadata": {},
266
+ "output_type": "display_data"
267
+ },
268
+ {
269
+ "data": {
270
+ "application/vnd.jupyter.widget-view+json": {
271
+ "model_id": "ea2fda33732745e48f7e1a6e1dae5ecc",
272
+ "version_major": 2,
273
+ "version_minor": 0
274
+ },
275
+ "text/plain": [
276
+ "tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
277
+ ]
278
+ },
279
+ "metadata": {},
280
+ "output_type": "display_data"
281
+ },
282
+ {
283
+ "data": {
284
+ "application/vnd.jupyter.widget-view+json": {
285
+ "model_id": "851fad297317474495592a8db14aabf5",
286
+ "version_major": 2,
287
+ "version_minor": 0
288
+ },
289
+ "text/plain": [
290
+ "model_quantized.onnx: 0%| | 0.00/1.10G [00:00<?, ?B/s]"
291
+ ]
292
+ },
293
+ "metadata": {},
294
+ "output_type": "display_data"
295
+ },
296
+ {
297
+ "data": {
298
+ "application/vnd.jupyter.widget-view+json": {
299
+ "model_id": "18d5836cab4f440799db945c1af7cfeb",
300
+ "version_major": 2,
301
+ "version_minor": 0
302
+ },
303
+ "text/plain": [
304
+ "tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
305
+ ]
306
+ },
307
+ "metadata": {},
308
+ "output_type": "display_data"
309
+ }
310
+ ],
311
+ "source": [
312
+ "from huggingface_hub import upload_folder\n",
313
+ "\n",
314
+ "# Path to your offline-models directory\n",
315
+ "folder_path = \"/home/administrator/offline-rag-model/offline-models\"\n",
316
+ "\n",
317
+ "# Your Hugging Face repository name\n",
318
+ "repo_name = \"onnx-models\"\n",
319
+ "\n",
320
+ "# Upload all files to Hugging Face\n",
321
+ "upload_folder(\n",
322
+ " folder_path=folder_path,\n",
323
+ " repo_id=f\"agoor97/{repo_name}\",\n",
324
+ " repo_type=\"model\",\n",
325
+ ")"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": null,
331
+ "id": "67055a1c",
332
+ "metadata": {},
333
+ "outputs": [],
334
+ "source": []
335
+ }
336
+ ],
337
+ "metadata": {
338
+ "kernelspec": {
339
+ "display_name": "base",
340
+ "language": "python",
341
+ "name": "python3"
342
+ },
343
+ "language_info": {
344
+ "codemirror_mode": {
345
+ "name": "ipython",
346
+ "version": 3
347
+ },
348
+ "file_extension": ".py",
349
+ "mimetype": "text/x-python",
350
+ "name": "python",
351
+ "nbconvert_exporter": "python",
352
+ "pygments_lexer": "ipython3",
353
+ "version": "3.12.9"
354
+ }
355
+ },
356
+ "nbformat": 4,
357
+ "nbformat_minor": 5
358
+ }
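The notebook only covers the upload side. On the consuming end (a Unity build step, CI job, or another machine) the same repository can be pulled back with `huggingface_hub`. A minimal sketch, assuming the `agoor97/onnx-models` repo id used in the upload cell:

```python
# Download a single exported model folder from the Hub (counterpart to upload_folder above).
from huggingface_hub import snapshot_download

local_path = snapshot_download(
    repo_id="agoor97/onnx-models",                         # repo targeted by the upload cell
    repo_type="model",
    allow_patterns=["onnx_models/opt_onnx_quantized/*"],   # fetch just one model directory
    local_dir="./downloaded_models",
)
print("Model files available under:", local_path)
```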
old_scripts/convert_for_unity.py ADDED
@@ -0,0 +1,1024 @@
1
+ import os
2
+ import gc
3
+ import sys
4
+ import time
5
+ import logging
6
+ import traceback
7
+ import torch
8
+ import warnings
9
+ import numpy as np
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+ from transformers.generation import GenerationConfig
12
+ from tqdm import tqdm
13
+ from onnxruntime.quantization import quantize_dynamic, QuantType
14
+
15
+ # Configure logging
16
+ logging.basicConfig(
17
+ level=logging.INFO,
18
+ format='%(asctime)s - %(levelname)s - %(message)s',
19
+ datefmt='%Y-%m-%d %H:%M:%S'
20
+ )
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Suppress unhelpful warnings
24
+ warnings.filterwarnings("ignore", category=UserWarning)
25
+
26
+
27
+ class GenerationWrapper(torch.nn.Module):
28
+ """
29
+ Wrapper for model export that handles generation properly.
30
+ This ensures the model can be correctly used for text generation.
31
+ """
32
+ def __init__(self, model):
33
+ super().__init__()
34
+ self.model = model
35
+ self.config = model.config
36
+
37
+ def forward(self, input_ids, attention_mask=None):
38
+ # Return only the logits to avoid complex structures
39
+ with torch.no_grad():
40
+ try:
41
+ # Standard approach for most models
42
+ outputs = self.model(
43
+ input_ids=input_ids,
44
+ attention_mask=attention_mask,
45
+ use_cache=False,
46
+ return_dict=True
47
+ )
48
+ return outputs.logits
49
+ except Exception as e:
50
+ logger.warning(f"Standard forward pass failed, trying fallback: {str(e)}")
51
+ # Fallback for models with different API
52
+ outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
53
+ if hasattr(outputs, 'logits'):
54
+ return outputs.logits
55
+ elif isinstance(outputs, tuple) and len(outputs) > 0:
56
+ return outputs[0] # First element is typically logits
57
+ else:
58
+ raise ValueError("Could not extract logits from model outputs")
59
+
60
+ def verify_model_generation(model, tokenizer, device="cpu"):
61
+ """Test model generation capabilities before export"""
62
+ model.eval()
63
+
64
+ # Use a chat-like prompt for better testing
65
+ prompt = "User: Hello, how are you today?\nAssistant:"
66
+
67
+ logger.info("Testing model generation...")
68
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
69
+
70
+ # Configure generation parameters
71
+ gen_config = GenerationConfig(
72
+ max_length=100,
73
+ do_sample=True,
74
+ temperature=0.7,
75
+ num_return_sequences=1,
76
+ )
77
+
78
+ try:
79
+ # Try generation
80
+ with torch.no_grad():
81
+ outputs = model.generate(
82
+ **inputs,
83
+ generation_config=gen_config
84
+ )
85
+
86
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
87
+ logger.info(f"Test generation result: {generated_text}")
88
+
89
+ if len(generated_text) <= len(prompt):
90
+ logger.warning("Generation output is not longer than input prompt!")
91
+
92
+ return True
93
+ except Exception as e:
94
+ logger.error(f"Generation test failed: {str(e)}")
95
+ return False
96
+
97
+ def test_onnx_model(onnx_path, tokenizer):
98
+ """Verify the ONNX model can be loaded and run"""
99
+ try:
100
+ import onnxruntime as ort
101
+
102
+ logger.info("Testing ONNX model inference...")
103
+ session = ort.InferenceSession(onnx_path)
104
+
105
+ # Get input and output names
106
+ input_names = [input.name for input in session.get_inputs()]
107
+ output_names = [output.name for output in session.get_outputs()]
108
+
109
+ # Create test input
110
+ prompt = "User: Hello, how are you?\nAssistant:"
111
+ inputs = tokenizer(prompt, return_tensors="np")
112
+
113
+ # Prepare input dict
114
+ onnx_inputs = {}
115
+ for name in input_names:
116
+ if name == "input_ids" and "input_ids" in inputs:
117
+ onnx_inputs[name] = inputs["input_ids"]
118
+ elif name == "attention_mask" and "attention_mask" in inputs:
119
+ onnx_inputs[name] = inputs["attention_mask"]
120
+
121
+ # Run inference
122
+ outputs = session.run(output_names, onnx_inputs)
123
+
124
+ # Check output shape
125
+ logits = outputs[0]
126
+ logger.info(f"ONNX model output shape: {logits.shape}")
127
+
128
+ if logits.shape[0] != 1 or logits.shape[1] != inputs["input_ids"].shape[1]:
129
+ logger.warning("Output shape doesn't match expected dimensions!")
130
+
131
+ # Test next token prediction
132
+ next_token_logits = logits[0, -1, :]
133
+ next_token_id = np.argmax(next_token_logits)
134
+ next_token = tokenizer.decode([next_token_id])
135
+ logger.info(f"Next predicted token: '{next_token}'")
136
+
137
+ return True
138
+ except Exception as e:
139
+ logger.error(f"ONNX model test failed: {str(e)}")
140
+ return False
141
+
142
+ def post_process_onnx_for_unity(onnx_path):
143
+ """
144
+ Post-process ONNX model to be compatible with Unity Sentis
145
+ using only core onnx functionality (no onnxsim)
146
+ """
147
+ try:
148
+ import onnx
149
+
150
+ logger.info("Post-processing ONNX model for Unity compatibility...")
151
+
152
+ # First, create a backup of the original model
153
+ backup_path = onnx_path.replace(".onnx", "_original.onnx")
154
+ import shutil
155
+ shutil.copy(onnx_path, backup_path)
156
+ logger.info(f"Original model backed up to {backup_path}")
157
+
158
+ # Load the model
159
+ model = onnx.load(onnx_path)
160
+
161
+ # Basic model checks and optimizations
162
+ try:
163
+ # Check model validity
164
+ onnx.checker.check_model(model)
165
+ logger.info("✓ Model structure validated successfully")
166
+
167
+ # Apply shape inference
168
+ inferred_model = onnx.shape_inference.infer_shapes(model)
169
+ onnx.save(inferred_model, onnx_path)
170
+ logger.info("✓ Applied shape inference")
171
+
172
+ except Exception as e:
173
+ logger.warning(f"Model validation/optimization error (continuing): {str(e)}")
174
+
175
+ return True
176
+
177
+ except Exception as e:
178
+ logger.warning(f"ONNX post-processing error (skipping): {str(e)}")
179
+ return False
180
+
181
+ def is_architecture_compatible(model_id):
182
+ """
183
+ Check if the model architecture is expected to be compatible with ONNX opset 11
184
+ """
185
+ model_id_lower = model_id.lower()
186
+
187
+ # Models known to work with opset 11
188
+ compatible_architectures = [
189
+ "gpt2", "distilgpt2", "opt-125m", "opt-350m",
190
+ "pythia-70m", "pythia-160m", "rwkv", "gpt-neo"
191
+ ]
192
+
193
+ # Models likely requiring higher opsets (usually 14+)
194
+ incompatible_architectures = [
195
+ "llama", "mistral", "mixtral", "tinyllama", "phi-2",
196
+ "gemma", "falcon", "bloom"
197
+ ]
198
+
199
+ # Check for compatibility
200
+ for arch in compatible_architectures:
201
+ if arch in model_id_lower:
202
+ return True, 11
203
+
204
+ # Check for known incompatible architectures
205
+ for arch in incompatible_architectures:
206
+ if arch in model_id_lower:
207
+ return False, 14
208
+
209
+ # For phi-1 models, use opset 14 but mark as potentially compatible
210
+ if "phi-1" in model_id_lower:
211
+ return True, 14
212
+
213
+ # Default to opset 14 for unknown architectures
214
+ return False, 14
215
+
216
+ def setup_chat_template(model_id, tokenizer):
217
+ """
218
+ Setup appropriate chat template based on model architecture
219
+ """
220
+ model_id_lower = model_id.lower()
221
+
222
+ # Try to setup chat template if it doesn't have one
223
+ try:
224
+ if not hasattr(tokenizer, "chat_template") or tokenizer.chat_template is None:
225
+ logger.info("Setting up chat template for improved conversations...")
226
+
227
+ # Determine chat template based on model
228
+ if "gpt2" in model_id_lower or "pythia" in model_id_lower or "opt" in model_id_lower:
229
+ # Simple template for base models
230
+ chat_template = "{% for message in messages %}\n{% if message['role'] == 'user' %}\nHuman: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}\nAI: {{ message['content'] }}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}\nAI: {% endif %}"
231
+ tokenizer.chat_template = chat_template
232
+ logger.info("✓ Added simple Human/AI chat template")
233
+
234
+ elif "phi" in model_id_lower:
235
+ # Microsoft Phi models template
236
+ chat_template = "{% for message in messages %}\n{% if message['role'] == 'user' %}\nHuman: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}\nAssistant: {{ message['content'] }}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}\nAssistant: {% endif %}"
237
+ tokenizer.chat_template = chat_template
238
+ logger.info("✓ Added Phi-style Human/Assistant chat template")
239
+
240
+ elif "rwkv" in model_id_lower:
241
+ # RWKV template
242
+ chat_template = "{% for message in messages %}\n{% if message['role'] == 'user' %}\nUser: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}\nBot: {{ message['content'] }}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}\nBot: {% endif %}"
243
+ tokenizer.chat_template = chat_template
244
+ logger.info("✓ Added RWKV-style User/Bot chat template")
245
+
246
+ except Exception as e:
247
+ logger.warning(f"Couldn't setup chat template: {str(e)}")
248
+ logger.info("Chat template setup will need to be handled in Unity")
249
+
250
+ def convert_model(model_id, output_dir="./onnx_models", seq_length=32, quantize=True, force_opset=None):
251
+ """
252
+ Convert a model to ONNX format with focus on Unity compatibility.
253
+
254
+ Args:
255
+ model_id: HuggingFace model ID or path
256
+ output_dir: Directory to save the model
257
+ seq_length: Input sequence length for export
258
+ quantize: Whether to quantize the model to INT8
259
+ force_opset: Force a specific ONNX opset version
260
+
261
+ Returns:
262
+ bool: Success status
263
+ """
264
+ start_time = time.time()
265
+
266
+ # Check model architecture for compatibility
267
+ is_compatible, recommended_opset = is_architecture_compatible(model_id)
268
+
269
+ # Use forced opset if provided, otherwise use recommended
270
+ opset_version = force_opset if force_opset is not None else recommended_opset
271
+
272
+ # Warn if using a model that might not be compatible with Unity
273
+ if not is_compatible and opset_version < 14:
274
+ logger.warning(f"⚠ Model {model_id} may not be compatible with opset {opset_version}")
275
+ logger.warning(f"⚠ Recommended opset for this model: {recommended_opset}")
276
+ logger.warning(f"⚠ You can force a higher opset with --opset {recommended_opset}")
277
+
278
+ logger.info(f"\n{'=' * 60}")
279
+ logger.info(f"Converting {model_id} to ONNX for Unity (opset {opset_version})")
280
+ logger.info(f"{'=' * 60}")
281
+
282
+ # Create output directory
283
+ model_name = model_id.split("/")[-1]
284
+ model_dir = os.path.join(output_dir, model_name)
285
+ os.makedirs(model_dir, exist_ok=True)
286
+
287
+ try:
288
+ # Step 1: Load tokenizer
289
+ logger.info("Step 1/7: Loading tokenizer...")
290
+
291
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
292
+ if tokenizer.pad_token is None and hasattr(tokenizer, 'eos_token'):
293
+ logger.info("Adding pad_token = eos_token")
294
+ tokenizer.pad_token = tokenizer.eos_token
295
+
296
+ # Setup chat template for better conversation formatting
297
+ setup_chat_template(model_id, tokenizer)
298
+
299
+ # Save tokenizer
300
+ tokenizer.save_pretrained(model_dir)
301
+ logger.info(f"✓ Tokenizer saved to {model_dir}")
302
+
303
+ # Step 2: Load model with reliability optimizations
304
+ logger.info("Step 2/7: Loading model...")
305
+
306
+ # Clean memory
307
+ gc.collect()
308
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
309
+
310
+ # Determine device
311
+ device = "cuda" if torch.cuda.is_available() else "cpu"
312
+
313
+ # Load model with full precision
314
+ try:
315
+ model = AutoModelForCausalLM.from_pretrained(
316
+ model_id,
317
+ torch_dtype=torch.float32, # Use full precision for reliability
318
+ low_cpu_mem_usage=True, # Reduce memory usage
319
+ device_map=device # Use CUDA if available
320
+ )
321
+ except Exception as e:
322
+ logger.warning(f"Standard loading failed, trying with 'trust_remote_code=True': {str(e)}")
323
+ # Some models (like RWKV) need trust_remote_code
324
+ model = AutoModelForCausalLM.from_pretrained(
325
+ model_id,
326
+ torch_dtype=torch.float32,
327
+ low_cpu_mem_usage=True,
328
+ device_map=device,
329
+ trust_remote_code=True
330
+ )
331
+
332
+ # Save config
333
+ model.config.save_pretrained(model_dir)
334
+ logger.info(f"✓ Model config saved to {model_dir}")
335
+
336
+ # Step 3: Verify model can generate chat responses
337
+ logger.info("Step 3/7: Validating chat capabilities...")
338
+
339
+ if not verify_model_generation(model, tokenizer, device):
340
+ logger.warning("⚠ Model chat test didn't complete successfully")
341
+ logger.info("Continuing with export anyway...")
342
+
343
+ # Step 4: Export to ONNX
344
+ logger.info(f"Step 4/7: Exporting to ONNX format with opset {opset_version}...")
345
+
346
+ # Wrap model with generation-optimized interface
347
+ wrapped_model = GenerationWrapper(model)
348
+ wrapped_model.eval()
349
+
350
+ # Clean memory again
351
+ gc.collect()
352
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
353
+
354
+ # Export to ONNX with appropriate opset version
355
+ onnx_path = os.path.join(model_dir, "model.onnx")
356
+
357
+ # Create minimal input
358
+ batch_size = 1
359
+ dummy_input = torch.ones(batch_size, seq_length, dtype=torch.long)
360
+ attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)
361
+
362
+ # Move tensors to correct device
363
+ dummy_input = dummy_input.to(device)
364
+ attention_mask = attention_mask.to(device)
365
+
366
+ # Export to ONNX with required opset
367
+ with torch.no_grad():
368
+ torch.onnx.export(
369
+ wrapped_model, # Wrapped model
370
+ (dummy_input, attention_mask), # Input tensors
371
+ onnx_path, # Output path
372
+ export_params=True, # Store weights
373
+ opset_version=opset_version, # Required opset version
374
+ do_constant_folding=True, # Optimize constants
375
+ input_names=['input_ids', 'attention_mask'], # Input names
376
+ output_names=['logits'], # Output name
377
+ dynamic_axes={ # Dynamic dimensions
378
+ 'input_ids': {0: 'batch_size', 1: 'sequence'},
379
+ 'attention_mask': {0: 'batch_size', 1: 'sequence'},
380
+ 'logits': {0: 'batch_size', 1: 'sequence'}
381
+ }
382
+ )
383
+
384
+ # Clean up to save memory
385
+ del model
386
+ del wrapped_model
387
+ gc.collect()
388
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
389
+
390
+ # Verify export success
391
+ if os.path.exists(onnx_path):
392
+ size_mb = os.path.getsize(onnx_path) / (1024 * 1024)
393
+ logger.info(f"✓ ONNX model saved to {onnx_path}")
394
+ logger.info(f"✓ Original size: {size_mb:.2f} MB")
395
+
396
+ # Step 5: Post-process the ONNX model for better Unity compatibility
397
+ logger.info("Step 5/7: Post-processing ONNX model for Unity compatibility...")
398
+
399
+ # Try to post-process model for Unity
400
+ try:
401
+ post_process_onnx_for_unity(onnx_path)
402
+ except Exception as e:
403
+ logger.warning(f"Post-processing failed (non-critical): {str(e)}")
404
+
405
+ # Test ONNX model
406
+ test_onnx_model(onnx_path, tokenizer)
407
+
408
+ # Step 6: Quantize the model (optional)
409
+ if quantize:
410
+ logger.info("Step 6/7: Applying INT8 quantization...")
411
+ quant_path = onnx_path.replace(".onnx", "_quantized.onnx")
412
+
413
+ try:
414
+ with tqdm(total=100, desc="Quantizing") as pbar:
415
+ # Update progress callback
416
+ def update_progress(x):
417
+ pbar.update(1)
418
+
419
+ # Apply quantization
420
+ quantize_dynamic(
421
+ model_input=onnx_path,
422
+ model_output=quant_path,
423
+ per_channel=False,
424
+ reduce_range=False,
425
+ weight_type=QuantType.QInt8,
426
+ optimize_model=True,
427
+ use_external_data_format=False
428
+ )
429
+
430
+ pbar.update(100) # Ensure progress reaches 100%
431
+
432
+ if os.path.exists(quant_path):
433
+ quant_size = os.path.getsize(quant_path) / (1024 * 1024)
434
+ logger.info(f"✓ Quantized size: {quant_size:.2f} MB")
435
+ logger.info(f"✓ Size reduction: {(1 - quant_size/size_mb) * 100:.1f}%")
436
+
437
+ # Test the quantized model
438
+ test_onnx_model(quant_path, tokenizer)
439
+
440
+ # Rename original as backup
441
+ backup_path = onnx_path.replace(".onnx", "_fp32.onnx")
442
+ os.rename(onnx_path, backup_path)
443
+
444
+ # Replace original with quantized
445
+ os.rename(quant_path, onnx_path)
446
+ logger.info("✓ Original model preserved as *_fp32.onnx")
447
+ logger.info("✓ Replaced original with quantized version")
448
+ else:
449
+ logger.warning("⚠ Quantized file not created, using original")
450
+ except Exception as e:
451
+ logger.error(f"⚠ Quantization error: {str(e)}")
452
+ logger.info("⚠ Using original model without quantization")
453
+ else:
454
+ logger.info("Step 6/7: Skipping quantization as requested")
455
+
456
+ # Step 7: Generate Unity integration examples
457
+ logger.info("Step 7/7: Generating Unity integration examples...")
458
+
459
+ # Create a Unity integration example
460
+ unity_example_path = os.path.join(model_dir, "unity_integration.cs")
461
+ with open(unity_example_path, 'w') as f:
462
+ f.write("""
463
+ using UnityEngine;
464
+ using Unity.Sentis;
465
+ using System.Collections.Generic;
466
+ using System.Linq;
467
+ using System.Text;
468
+ using System.Threading.Tasks;
469
+
470
+ public class ONNXChatbot : MonoBehaviour
471
+ {
472
+ [SerializeField] private ModelAsset modelAsset;
473
+ [SerializeField] private TextAsset tokenizerVocabJson;
474
+ [SerializeField] private int maxTokens = 50;
475
+ [SerializeField] private float temperature = 0.7f;
476
+
477
+ private IWorker worker;
478
+ private Dictionary<string, Tensor> inputs;
479
+ private SimpleTokenizer tokenizer;
480
+ private bool isGenerating = false;
481
+
482
+ void Start()
483
+ {
484
+ // Initialize the model
485
+ var model = ModelLoader.Load(modelAsset);
486
+ worker = WorkerFactory.CreateWorker(WorkerFactory.Type.ComputePrecompiled, model);
487
+
488
+ // Initialize tokenizer
489
+ tokenizer = new SimpleTokenizer(tokenizerVocabJson.text);
490
+
491
+ // Prepare for inference
492
+ inputs = new Dictionary<string, Tensor>();
493
+
494
+ Debug.Log("Model and tokenizer initialized successfully.");
495
+ }
496
+
497
+ public async Task<string> GenerateResponseAsync(string userMessage)
498
+ {
499
+ if (isGenerating)
500
+ {
501
+ Debug.LogWarning("Already generating a response. Please wait.");
502
+ return "Already generating a response. Please wait.";
503
+ }
504
+
505
+ isGenerating = true;
506
+
507
+ try
508
+ {
509
+ // Format prompt with chat template
510
+ string prompt = FormatChatPrompt(userMessage);
511
+ Debug.Log($"Formatted prompt: {prompt}");
512
+
513
+ // Tokenize input
514
+ var tokenIds = tokenizer.Encode(prompt);
515
+ Debug.Log($"Encoded to {tokenIds.Length} tokens");
516
+
517
+ if (tokenIds.Length > 0)
518
+ {
519
+ // Generate response token by token
520
+ StringBuilder responseBuilder = new StringBuilder();
521
+ List<int> currentIds = tokenIds.ToList();
522
+
523
+ for (int i = 0; i < maxTokens; i++)
524
+ {
525
+ // Make sure we don't exceed the model's context window
526
+ if (currentIds.Count > 1024)
527
+ {
528
+ // If too long, keep only the last 1024 tokens
529
+ currentIds = currentIds.Skip(currentIds.Count - 1024).Take(1024).ToList();
530
+ }
531
+
532
+ // Create tensors for current sequence
533
+ using (var inputIdsTensor = new TensorInt(new TensorShape(1, currentIds.Count), currentIds.ToArray()))
534
+ using (var attentionMaskTensor = new TensorInt(new TensorShape(1, currentIds.Count), Enumerable.Repeat(1, currentIds.Count).ToArray()))
535
+ {
536
+ // Run inference
537
+ inputs.Clear();
538
+ inputs["input_ids"] = inputIdsTensor;
539
+ inputs["attention_mask"] = attentionMaskTensor;
540
+
541
+ worker.Execute(inputs);
542
+ var logits = worker.PeekOutput() as TensorFloat;
543
+
544
+ // Get next token prediction
545
+ int nextToken = SampleNextToken(logits, currentIds, temperature);
546
+
547
+ // If we hit the end token or a newline after content, stop
548
+ if (nextToken == tokenizer.EosToken ||
549
+ (i > 0 && nextToken == tokenizer.NewlineToken))
550
+ {
551
+ break;
552
+ }
553
+
554
+ // Add token to current sequence for next iteration
555
+ currentIds.Add(nextToken);
556
+
557
+ // Decode the latest token
558
+ string newToken = tokenizer.Decode(new[] { nextToken });
559
+ responseBuilder.Append(newToken);
560
+
561
+ // For smoother output, yield every few tokens
562
+ if (i % 5 == 0)
563
+ {
564
+ await Task.Delay(1);
565
+ }
566
+ }
567
+ }
568
+
569
+ // Return the full response, without the prompt
570
+ string fullResponse = responseBuilder.ToString();
571
+ return CleanResponse(fullResponse);
572
+ }
573
+ else
574
+ {
575
+ Debug.LogError("Tokenization failed: empty token list");
576
+ return "Sorry, I couldn't process that input.";
577
+ }
578
+ }
579
+ catch (System.Exception ex)
580
+ {
581
+ Debug.LogError($"Generation error: {ex.Message}\\n{ex.StackTrace}");
582
+ return "Sorry, an error occurred while generating a response.";
583
+ }
584
+ finally
585
+ {
586
+ isGenerating = false;
587
+ }
588
+ }
589
+
590
+ private string FormatChatPrompt(string userMessage)
591
+ {
592
+ // You may need to adjust this template based on your specific model
593
+ return $"User: {userMessage}\\nAssistant:";
594
+ }
595
+
596
+ private string CleanResponse(string response)
597
+ {
598
+ // Extract only the Assistant's response
599
+ int assistantPrefix = response.IndexOf("Assistant:");
600
+ if (assistantPrefix >= 0)
601
+ {
602
+ response = response.Substring(assistantPrefix + "Assistant:".Length).Trim();
603
+ }
604
+
605
+ // Stop at any "User:" marker if present
606
+ int nextUser = response.IndexOf("User:");
607
+ if (nextUser >= 0)
608
+ {
609
+ response = response.Substring(0, nextUser).Trim();
610
+ }
611
+
612
+ return response;
613
+ }
614
+
615
+ private int SampleNextToken(TensorFloat logits, List<int> currentInputs, float temp)
616
+ {
617
+ // Get logits for the last position
618
+ int lastPos = currentInputs.Count - 1;
619
+ int vocabSize = logits.shape.channels;
620
+
621
+ // Prepare array for logits
622
+ float[] lastLogits = new float[vocabSize];
623
+
624
+ // Extract logits for the last token position
625
+ for (int i = 0; i < vocabSize; i++)
626
+ {
627
+ lastLogits[i] = logits[0, lastPos, i];
628
+ }
629
+
630
+ // Simple temperature-based sampling
631
+ if (temp <= 0.0f)
632
+ {
633
+ // Greedy sampling (argmax)
634
+ int maxIndex = 0;
635
+ float maxValue = lastLogits[0];
636
+
637
+ for (int i = 1; i < vocabSize; i++)
638
+ {
639
+ if (lastLogits[i] > maxValue)
640
+ {
641
+ maxValue = lastLogits[i];
642
+ maxIndex = i;
643
+ }
644
+ }
645
+
646
+ return maxIndex;
647
+ }
648
+ else
649
+ {
650
+ // Temperature sampling
651
+ // Apply temperature
652
+ for (int i = 0; i < vocabSize; i++)
653
+ {
654
+ lastLogits[i] /= temp;
655
+ }
656
+
657
+ // Softmax
658
+ float maxLogit = lastLogits.Max();
659
+ float sum = 0.0f;
660
+
661
+ for (int i = 0; i < vocabSize; i++)
662
+ {
663
+ lastLogits[i] = Mathf.Exp(lastLogits[i] - maxLogit);
664
+ sum += lastLogits[i];
665
+ }
666
+
667
+ for (int i = 0; i < vocabSize; i++)
668
+ {
669
+ lastLogits[i] /= sum;
670
+ }
671
+
672
+ // Sample from distribution
673
+ float random = Random.value;
674
+ float cumulativeProb = 0.0f;
675
+
676
+ for (int i = 0; i < vocabSize; i++)
677
+ {
678
+ cumulativeProb += lastLogits[i];
679
+ if (random < cumulativeProb)
680
+ {
681
+ return i;
682
+ }
683
+ }
684
+
685
+ // Fallback to last token if sampling fails
686
+ return vocabSize - 1;
687
+ }
688
+ }
689
+
690
+ void OnDestroy()
691
+ {
692
+ worker?.Dispose();
693
+ }
694
+ }
695
+
696
+ // Simple tokenizer implementation for Unity
697
+ public class SimpleTokenizer
698
+ {
699
+ private Dictionary<string, int> vocab;
700
+ private Dictionary<int, string> reversedVocab;
701
+
702
+ public int PadToken { get; private set; }
703
+ public int EosToken { get; private set; }
704
+ public int BosToken { get; private set; }
705
+ public int NewlineToken { get; private set; }
706
+
707
+ public SimpleTokenizer(string vocabJson)
708
+ {
709
+ // Parse the vocabulary from JSON
710
+ vocab = new Dictionary<string, int>();
711
+
712
+ // Simple JSON parsing (you'll need a proper JSON parser in production)
713
+ string[] entries = vocabJson.Split(new[] { '\\n', '{', '}', '\"', ':', ',' },
714
+ System.StringSplitOptions.RemoveEmptyEntries);
715
+
716
+ for (int i = 0; i < entries.Length - 1; i += 2)
717
+ {
718
+ string token = entries[i].Trim();
719
+ if (int.TryParse(entries[i + 1].Trim(), out int id))
720
+ {
721
+ vocab[token] = id;
722
+ }
723
+ }
724
+
725
+ // Create reversed vocabulary for decoding
726
+ reversedVocab = vocab.ToDictionary(kv => kv.Value, kv => kv.Key);
727
+
728
+ // Find special tokens
729
+ SetSpecialTokens();
730
+
731
+ Debug.Log($"Tokenizer initialized with {vocab.Count} tokens");
732
+ }
733
+
734
+ private void SetSpecialTokens()
735
+ {
736
+ // Try to find standard special tokens
737
+ PadToken = FindToken(new[] { "<pad>", "[PAD]", "<|endoftext|>" });
738
+ EosToken = FindToken(new[] { "</s>", "<|endoftext|>", "[EOS]", "<eos>" });
739
+ BosToken = FindToken(new[] { "<s>", "<|startoftext|>", "[BOS]", "<bos>" });
740
+
741
+ // Find newline token
742
+ foreach (var entry in vocab)
743
+ {
744
+ if (entry.Key == "\\n" || entry.Key == "<\\n>" || entry.Key == "\\n")
745
+ {
746
+ NewlineToken = entry.Value;
747
+ break;
748
+ }
749
+ }
750
+
751
+ Debug.Log($"Special tokens - PAD: {PadToken}, EOS: {EosToken}, BOS: {BosToken}, NEWLINE: {NewlineToken}");
752
+ }
753
+
754
+ private int FindToken(string[] candidates)
755
+ {
756
+ foreach (var candidate in candidates)
757
+ {
758
+ if (vocab.TryGetValue(candidate, out int id))
759
+ {
760
+ return id;
761
+ }
762
+ }
763
+
764
+ // Return -1 if not found
765
+ return -1;
766
+ }
767
+
768
+ public int[] Encode(string text)
769
+ {
770
+ // Simple character-level tokenization
771
+ // In production, use a proper BPE/WordPiece tokenizer implementation
772
+ List<int> tokens = new List<int>();
773
+ StringBuilder currentToken = new StringBuilder();
774
+
775
+ // Add BOS token if available
776
+ if (BosToken != -1)
777
+ {
778
+ tokens.Add(BosToken);
779
+ }
780
+
781
+ // Very simple tokenization - in production, this would implement
782
+ // the specific tokenization algorithm for your model
783
+ foreach (char c in text)
784
+ {
785
+ currentToken.Append(c);
786
+ string current = currentToken.ToString();
787
+
788
+ if (vocab.TryGetValue(current, out int id))
789
+ {
790
+ tokens.Add(id);
791
+ currentToken.Clear();
792
+ }
793
+ else if (currentToken.Length > 10)
794
+ {
795
+ // If token is too long, add unknown token and reset
796
+ tokens.Add(vocab.ContainsKey("<unk>") ? vocab["<unk>"] : 0);
797
+ currentToken.Clear();
798
+ currentToken.Append(c);
799
+ }
800
+ }
801
+
802
+ // Handle any remaining text
803
+ if (currentToken.Length > 0)
804
+ {
805
+ tokens.Add(vocab.ContainsKey("<unk>") ? vocab["<unk>"] : 0);
806
+ }
807
+
808
+ return tokens.ToArray();
809
+ }
810
+
811
+ public string Decode(int[] ids)
812
+ {
813
+ StringBuilder result = new StringBuilder();
814
+
815
+ foreach (int id in ids)
816
+ {
817
+ if (reversedVocab.TryGetValue(id, out string token))
818
+ {
819
+ // Some tokenizers use special prefixes like "Ġ" for spaces
820
+ string processedToken = token
821
+ .Replace("Ġ", " ")
822
+ .Replace("Ċ", "\n")
823
+ .Replace("▁", " ");
824
+
825
+ result.Append(processedToken);
826
+ }
827
+ }
828
+
829
+ return result.ToString();
830
+ }
831
+ }
832
+ """)
833
+
834
+ # Calculate elapsed time
835
+ end_time = time.time()
836
+ duration = end_time - start_time
837
+ logger.info(f"✓ Conversion completed in {duration:.2f} seconds")
838
+ logger.info(f"✓ Final model size: {os.path.getsize(onnx_path) / (1024 * 1024):.2f} MB")
839
+
840
+ # Create a Python example usage file
841
+ example_path = os.path.join(model_dir, "example_usage.py")
842
+ with open(example_path, 'w') as f:
843
+ f.write("""
844
+ import onnxruntime as ort
845
+ from transformers import AutoTokenizer
846
+ import numpy as np
847
+
848
+ # Load tokenizer and model
849
+ tokenizer = AutoTokenizer.from_pretrained("./") # Path to model directory
850
+ session = ort.InferenceSession("./model.onnx")
851
+
852
+ def generate_response(user_message, max_length=50):
853
+ # Format as a chat message
854
+ prompt = f"User: {user_message}\\nAssistant:"
855
+ inputs = tokenizer(prompt, return_tensors="np")
856
+
857
+ input_ids = inputs["input_ids"]
858
+ attention_mask = inputs["attention_mask"]
859
+
860
+ # Simple auto-regressive generation loop
861
+ for _ in range(max_length):
862
+ # Run inference for a single step
863
+ outputs = session.run(
864
+ ["logits"],
865
+ {
866
+ "input_ids": input_ids,
867
+ "attention_mask": attention_mask
868
+ }
869
+ )
870
+
871
+ # Get next token prediction from logits
872
+ logits = outputs[0]
873
+ next_token_logits = logits[0, -1, :]
874
+
875
+ # Apply temperature sampling
876
+ temperature = 0.7
877
+ next_token_logits = next_token_logits / temperature
878
+
879
+ # Apply softmax to get probabilities
880
+ exp_logits = np.exp(next_token_logits - np.max(next_token_logits))
881
+ probs = exp_logits / np.sum(exp_logits)
882
+
883
+ # Sample from the distribution
884
+ next_token_id = np.random.choice(probs.shape[0], p=probs)
885
+
886
+ # Stop if we hit the end of sequence token
887
+ if next_token_id == tokenizer.eos_token_id:
888
+ break
889
+
890
+ # Append new token to the input_ids
891
+ input_ids = np.concatenate([input_ids, [[next_token_id]]], axis=1)
892
+ attention_mask = np.concatenate([attention_mask, [[1]]], axis=1)
893
+
894
+ # Decode the entire response
895
+ response = tokenizer.decode(input_ids[0], skip_special_tokens=True)
896
+
897
+ # Extract only the assistant's response
898
+ if "Assistant:" in response:
899
+ response = response.split("Assistant:")[-1].strip()
900
+
901
+ return response
902
+
903
+ # Example usage
904
+ while True:
905
+ user_input = input("You: ")
906
+ if user_input.lower() in ['exit', 'quit']:
907
+ break
908
+ response = generate_response(user_input)
909
+ print(f"Assistant: {response}")
910
+ """)
911
+
912
+ logger.info(f"✓ Example usage saved to {example_path}")
913
+ logger.info(f"✓ Unity integration example saved to {unity_example_path}")
914
+ return True
915
+
916
+ else:
917
+ logger.error(f"× ONNX file not created at {onnx_path}")
918
+ return False
919
+
920
+ except Exception as e:
921
+ logger.error(f"× Error converting model: {str(e)}")
922
+ logger.error(traceback.format_exc())
923
+ return False
924
+
925
+ if __name__ == "__main__":
926
+ # Parse command line arguments
927
+ parser_available = False
928
+ try:
929
+ import argparse
930
+ parser = argparse.ArgumentParser(description="Convert HuggingFace models to ONNX for Unity")
931
+ parser.add_argument("model_id", type=str, help="HuggingFace model ID or path")
932
+ parser.add_argument("--output_dir", "-o", type=str, default="./onnx_models",
933
+ help="Output directory for the converted model")
934
+ parser.add_argument("--seq_length", "-s", type=int, default=32,
935
+ help="Sequence length for model export")
936
+ parser.add_argument("--no_quantize", action="store_true",
937
+ help="Skip INT8 quantization step")
938
+ parser.add_argument("--opset", "-op", type=int, default=None,
939
+ help="Force a specific ONNX opset version")
940
+
941
+ args = parser.parse_args()
942
+ parser_available = True
943
+
944
+ model_id = args.model_id
945
+ output_dir = args.output_dir
946
+ seq_length = args.seq_length
947
+ quantize = not args.no_quantize
948
+ force_opset = args.opset
949
+
950
+ except (ImportError, NameError):
951
+ # Fallback if argparse is not available
952
+ parser_available = False
953
+
954
+ if not parser_available:
955
+ if len(sys.argv) < 2:
956
+ print("Usage: python unity_compatible_converter.py MODEL_ID [OUTPUT_DIR] [SEQ_LENGTH] [--no-quantize] [--opset N]")
957
+ print("Example: python unity_compatible_converter.py distilgpt2 ./onnx_models 32")
958
+ print("\nRecommended chat models for Unity:")
959
+ print(" - distilgpt2 (smallest, opset 11)")
960
+ print(" - EleutherAI/pythia-70m (better quality, opset 11)")
961
+ print(" - microsoft/phi-1 (high quality, opset 14)")
962
+ print(" - TinyLlama/TinyLlama-1.1B-Chat-v1.0 (chat-tuned, opset 14)")
963
+ sys.exit(1)
964
+
965
+ model_id = sys.argv[1]
966
+ output_dir = sys.argv[2] if len(sys.argv) > 2 else "./onnx_models"
967
+ seq_length = int(sys.argv[3]) if len(sys.argv) > 3 else 32
968
+ quantize = "--no-quantize" not in sys.argv and "--no_quantize" not in sys.argv
969
+ force_opset = None
970
+
971
+ # Check for opset flag
972
+ for i, arg in enumerate(sys.argv):
973
+ if arg == "--opset" and i + 1 < len(sys.argv):
974
+ force_opset = int(sys.argv[i + 1])
975
+
976
+ # Check model architecture for automatic opset recommendation
977
+ is_compatible, recommended_opset = is_architecture_compatible(model_id)
978
+
979
+ # Print header
980
+ logger.info("\nUNITY-COMPATIBLE ONNX CONVERTER")
981
+ logger.info("===============================")
982
+ logger.info(f"Model: {model_id}")
983
+ logger.info(f"Output directory: {output_dir}")
984
+ logger.info(f"Sequence length: {seq_length}")
985
+
986
+ if force_opset is not None:
987
+ logger.info(f"ONNX opset version: {force_opset} (forced)")
988
+ else:
989
+ logger.info(f"Recommended ONNX opset: {recommended_opset}")
990
+ logger.info(f"Architecture compatible with opset 11: {'Yes' if is_compatible else 'No'}")
991
+
992
+ logger.info(f"Quantization: {'Enabled' if quantize else 'Disabled'}")
993
+
994
+ # Create output directory
995
+ os.makedirs(output_dir, exist_ok=True)
996
+
997
+ # Convert the model
998
+ success = convert_model(model_id, output_dir, seq_length, quantize, force_opset)
999
+
1000
+ if success:
1001
+ logger.info("\n" + "=" * 60)
1002
+ logger.info("CONVERSION SUCCESSFUL")
1003
+ logger.info("=" * 60)
1004
+ logger.info(f"Model: {model_id}")
1005
+ logger.info(f"Output directory: {os.path.abspath(output_dir)}")
1006
+ logger.info("The model is ready for Unity integration!")
1007
+ logger.info("\nNext steps:")
1008
+ logger.info("1. Import the ONNX model into Unity using the Sentis package")
1009
+ logger.info("2. Use the unity_integration.cs file as a starting point")
1010
+ logger.info("3. For tokenization in Unity, implement the SimpleTokenizer class")
1011
+ else:
1012
+ logger.info("\n" + "=" * 60)
1013
+ logger.info("CONVERSION FAILED")
1014
+ logger.info("=" * 60)
1015
+ logger.info("Please try one of the recommended models that work well with Unity:")
1016
+
1017
+ if is_compatible:
1018
+ logger.info("Compatible with Unity (opset 11):")
1019
+ logger.info(" - distilgpt2")
1020
+ logger.info(" - EleutherAI/pythia-70m")
1021
+
1022
+ logger.info("Advanced models (require opset 14):")
1023
+ logger.info(" - microsoft/phi-1 --opset 14")
1024
+ logger.info(" - TinyLlama/TinyLlama-1.1B-Chat-v1.0 --opset 14")
old_scripts/convert_single_model.py ADDED
@@ -0,0 +1,492 @@
1
+ import os
2
+ import gc
3
+ import sys
4
+ import time
5
+ import logging
6
+ import traceback
7
+ import torch
8
+ import warnings
9
+ import numpy as np
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+ from transformers.generation import GenerationConfig
12
+ from tqdm import tqdm
13
+ import onnx
14
+ from onnxruntime.quantization import quantize_dynamic, QuantType
15
+
16
+ # Configure logging
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(levelname)s - %(message)s',
20
+ datefmt='%Y-%m-%d %H:%M:%S'
21
+ )
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Suppress unhelpful warnings
25
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*The shape of the input dimension.*")
26
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*Converting a tensor to a Python.*")
27
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*The model does not use GenerationMixin.*")
28
+
29
+
30
+ class GenerationWrapper(torch.nn.Module):
31
+ """
32
+ Wrapper for model export that handles generation properly.
33
+ This ensures the model can be correctly used for text generation.
34
+ """
35
+ def __init__(self, model):
36
+ super().__init__()
37
+ self.model = model
38
+ self.config = model.config
39
+
40
+ def forward(self, input_ids, attention_mask=None):
41
+ # Return only the logits to avoid complex structures
42
+ with torch.no_grad():
43
+ try:
44
+ # Standard approach for most models
45
+ outputs = self.model(
46
+ input_ids=input_ids,
47
+ attention_mask=attention_mask,
48
+ use_cache=False,
49
+ return_dict=True
50
+ )
51
+ return outputs.logits
52
+ except Exception as e:
53
+ logger.warning(f"Standard forward pass failed, trying fallback: {str(e)}")
54
+ # Fallback for models with different API
55
+ outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
56
+ if hasattr(outputs, 'logits'):
57
+ return outputs.logits
58
+ elif isinstance(outputs, tuple) and len(outputs) > 0:
59
+ return outputs[0] # First element is typically logits
60
+ else:
61
+ raise ValueError("Could not extract logits from model outputs")
62
+
63
+
64
+ def verify_model_generation(model, tokenizer, device="cpu"):
65
+ """Test model generation capabilities before export"""
66
+ model.eval()
67
+ prompt = "Hello, how are you today? I am"
68
+
69
+ logger.info("Testing model generation...")
70
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
71
+
72
+ # Configure generation parameters
73
+ gen_config = GenerationConfig(
74
+ max_length=30,
75
+ do_sample=True,
76
+ temperature=0.7,
77
+ num_return_sequences=1,
78
+ )
79
+
80
+ try:
81
+ # Try generation
82
+ with torch.no_grad():
83
+ outputs = model.generate(
84
+ **inputs,
85
+ generation_config=gen_config
86
+ )
87
+
88
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
89
+ logger.info(f"Test generation result: {generated_text}")
90
+
91
+ if len(generated_text) <= len(prompt):
92
+ logger.warning("Generation output is not longer than input prompt!")
93
+
94
+ return True
95
+ except Exception as e:
96
+ logger.error(f"Generation test failed: {str(e)}")
97
+ return False
98
+
99
+
100
+ def test_onnx_model(onnx_path, tokenizer):
101
+ """Verify the ONNX model can be loaded and run"""
102
+ try:
103
+ import onnxruntime as ort
104
+
105
+ logger.info("Testing ONNX model inference...")
106
+ session = ort.InferenceSession(onnx_path)
107
+
108
+ # Get input and output names
109
+ input_names = [input.name for input in session.get_inputs()]
110
+ output_names = [output.name for output in session.get_outputs()]
111
+
112
+ # Create test input
113
+ prompt = "Hello, how are you?"
114
+ inputs = tokenizer(prompt, return_tensors="np")
115
+
116
+ # Prepare input dict
117
+ onnx_inputs = {}
118
+ for name in input_names:
119
+ if name == "input_ids" and "input_ids" in inputs:
120
+ onnx_inputs[name] = inputs["input_ids"]
121
+ elif name == "attention_mask" and "attention_mask" in inputs:
122
+ onnx_inputs[name] = inputs["attention_mask"]
123
+
124
+ # Run inference
125
+ outputs = session.run(output_names, onnx_inputs)
126
+
127
+ # Check output shape
128
+ logits = outputs[0]
129
+ logger.info(f"ONNX model output shape: {logits.shape}")
130
+
131
+ if logits.shape[0] != 1 or logits.shape[1] != inputs["input_ids"].shape[1]:
132
+ logger.warning("Output shape doesn't match expected dimensions!")
133
+
134
+ # Test next token prediction
135
+ next_token_logits = logits[0, -1, :]
136
+ next_token_id = np.argmax(next_token_logits)
137
+ next_token = tokenizer.decode([next_token_id])
138
+ logger.info(f"Next predicted token: '{next_token}'")
139
+
140
+ return True
141
+ except Exception as e:
142
+ logger.error(f"ONNX model test failed: {str(e)}")
143
+ return False
144
+
145
+
146
+ def optimize_onnx_model(onnx_path):
147
+ """Apply ONNX optimizations to improve performance"""
148
+ try:
149
+ logger.info("Optimizing ONNX model...")
150
+
151
+ # Load the model
152
+ model = onnx.load(onnx_path)
153
+
154
+ # Apply optimizations
155
+ from onnxruntime.transformers import optimizer
156
+
157
+ # Get model type from path
158
+ model_path = os.path.dirname(onnx_path)
159
+ model_name = os.path.basename(model_path).lower()
160
+
161
+ # Determine model type for optimization
162
+ if "gpt" in model_name:
163
+ model_type = "gpt2"
164
+ elif "opt" in model_name:
165
+ model_type = "opt"
166
+ elif "pythia" in model_name:
167
+ model_type = "gpt_neox"
168
+ else:
169
+ model_type = "gpt2" # Default fallback
170
+
171
+ logger.info(f"Using optimization profile for model type: {model_type}")
172
+
173
+ # Try to optimize the model
174
+ try:
175
+ optimized_model = optimizer.optimize_model(
176
+ onnx_path,
177
+ model_type=model_type,
178
+ num_heads=8, # Will be overridden by model's real config
179
+ hidden_size=768, # Will be overridden by model's real config
180
+ optimization_options=None
181
+ )
182
+ optimized_model.save_model_to_file(onnx_path)
183
+ logger.info("✓ ONNX model optimized")
184
+ return True
185
+ except Exception as e:
186
+ logger.warning(f"Optimization failed (non-critical): {str(e)}")
187
+ return False
188
+
189
+ except Exception as e:
190
+ logger.warning(f"ONNX optimization error (skipping): {str(e)}")
191
+ return False
192
+
193
+
194
+ def convert_model(model_id, output_dir="./onnx_models", seq_length=32, quantize=True):
195
+ """
196
+ Convert a model to ONNX format with focus on reliability for generation.
197
+
198
+ Args:
199
+ model_id: HuggingFace model ID or path
200
+ output_dir: Directory to save the model
201
+ seq_length: Input sequence length for export
202
+ quantize: Whether to quantize the model to INT8
203
+
204
+ Returns:
205
+ bool: Success status
206
+ """
207
+ start_time = time.time()
208
+
209
+ logger.info(f"\n{'=' * 60}")
210
+ logger.info(f"Converting {model_id} to ONNX (optimized for generation)")
211
+ logger.info(f"{'=' * 60}")
212
+
213
+ # Create output directory
214
+ model_name = model_id.split("/")[-1]
215
+ model_dir = os.path.join(output_dir, model_name)
216
+ os.makedirs(model_dir, exist_ok=True)
217
+
218
+ try:
219
+ # Step 1: Load tokenizer
220
+ logger.info("Step 1/6: Loading tokenizer...")
221
+
222
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
223
+ if tokenizer.pad_token is None and hasattr(tokenizer, 'eos_token'):
224
+ logger.info("Adding pad_token = eos_token")
225
+ tokenizer.pad_token = tokenizer.eos_token
226
+
227
+ # Save tokenizer
228
+ tokenizer.save_pretrained(model_dir)
229
+ logger.info(f"✓ Tokenizer saved to {model_dir}")
230
+
231
+ # Step 2: Load model with reliability optimizations
232
+ logger.info("Step 2/6: Loading model...")
233
+
234
+ # Clean memory
235
+ gc.collect()
236
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
237
+
238
+ # Determine device
239
+ device = "cuda" if torch.cuda.is_available() else "cpu"
240
+
241
+ # Load model with full precision
242
+ model = AutoModelForCausalLM.from_pretrained(
243
+ model_id,
244
+ torch_dtype=torch.float32, # Use full precision for reliability
245
+ low_cpu_mem_usage=True, # Reduce memory usage
246
+ device_map=device # Use CUDA if available
247
+ )
248
+
249
+ # Save config
250
+ model.config.save_pretrained(model_dir)
251
+ logger.info(f"✓ Model config saved to {model_dir}")
252
+
253
+ # Step 3: Verify model can generate text
254
+ logger.info("Step 3/6: Validating generation capabilities...")
255
+
256
+ if not verify_model_generation(model, tokenizer, device):
257
+ logger.warning("⚠ Model generation test didn't complete successfully")
258
+ logger.info("Continuing with export anyway...")
259
+
260
+ # Step 4: Wrap and prepare for export
261
+ logger.info("Step 4/6: Preparing for export...")
262
+
263
+ # Wrap model with generation-optimized interface
264
+ wrapped_model = GenerationWrapper(model)
265
+ wrapped_model.eval()
266
+
267
+ # Clean memory again
268
+ gc.collect()
269
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
270
+
271
+ # Step 5: Export to ONNX
272
+ logger.info("Step 5/6: Exporting to ONNX format...")
273
+ onnx_path = os.path.join(model_dir, "model.onnx")
274
+
275
+ # Create minimal input
276
+ batch_size = 1
277
+ dummy_input = torch.ones(batch_size, seq_length, dtype=torch.long)
278
+ attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)
279
+
280
+ # Move tensors to correct device
281
+ dummy_input = dummy_input.to(device)
282
+ attention_mask = attention_mask.to(device)
283
+
284
+ # Export to ONNX with required opset for transformer models
285
+ with torch.no_grad():
286
+ torch.onnx.export(
287
+ wrapped_model, # Wrapped model
288
+ (dummy_input, attention_mask), # Input tensors
289
+ onnx_path, # Output path
290
+ export_params=True, # Store weights
291
+ opset_version=14, # Required for transformer models
292
+ do_constant_folding=True, # Optimize constants
293
+ input_names=['input_ids', 'attention_mask'], # Input names
294
+ output_names=['logits'], # Output name
295
+ dynamic_axes={ # Dynamic dimensions
296
+ 'input_ids': {0: 'batch_size', 1: 'sequence'},
297
+ 'attention_mask': {0: 'batch_size', 1: 'sequence'},
298
+ 'logits': {0: 'batch_size', 1: 'sequence'}
299
+ }
300
+ )
301
+
302
+ # Clean up to save memory
303
+ del model
304
+ del wrapped_model
305
+ gc.collect()
306
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
307
+
308
+ # Verify export success
309
+ if os.path.exists(onnx_path):
310
+ size_mb = os.path.getsize(onnx_path) / (1024 * 1024)
311
+ logger.info(f"✓ ONNX model saved to {onnx_path}")
312
+ logger.info(f"✓ Original size: {size_mb:.2f} MB")
313
+
314
+ # Test ONNX model
315
+ test_onnx_model(onnx_path, tokenizer)
316
+
317
+ # Optimize the ONNX model
318
+ optimize_onnx_model(onnx_path)
319
+
320
+ # Step 6: Quantize the model (optional)
321
+ if quantize:
322
+ logger.info("Step 6/6: Applying INT8 quantization...")
323
+ quant_path = onnx_path.replace(".onnx", "_quantized.onnx")
324
+
325
+ try:
326
+ with tqdm(total=100, desc="Quantizing") as pbar:
327
+ # Update progress callback
328
+ def update_progress(x):
329
+ pbar.update(1)
330
+
331
+ quantize_dynamic(
332
+ model_input=onnx_path,
333
+ model_output=quant_path,
334
+ per_channel=False,
335
+ reduce_range=False,
336
+ weight_type=QuantType.QInt8,
337
+ optimize_model=True,
338
+ use_external_data_format=False
339
+ )
340
+
341
+ pbar.update(100) # Ensure progress reaches 100%
342
+
343
+ if os.path.exists(quant_path):
344
+ quant_size = os.path.getsize(quant_path) / (1024 * 1024)
345
+ logger.info(f"✓ Quantized size: {quant_size:.2f} MB")
346
+ logger.info(f"✓ Size reduction: {(1 - quant_size/size_mb) * 100:.1f}%")
347
+
348
+ # Test the quantized model
349
+ test_onnx_model(quant_path, tokenizer)
350
+
351
+ # Rename original as backup
352
+ backup_path = onnx_path.replace(".onnx", "_fp32.onnx")
353
+ os.rename(onnx_path, backup_path)
354
+
355
+ # Replace original with quantized
356
+ os.rename(quant_path, onnx_path)
357
+ logger.info("✓ Original model preserved as *_fp32.onnx")
358
+ logger.info("✓ Replaced original with quantized version")
359
+ else:
360
+ logger.warning("⚠ Quantized file not created, using original")
361
+ except Exception as e:
362
+ logger.error(f"⚠ Quantization error: {str(e)}")
363
+ logger.info("⚠ Using original model without quantization")
364
+ else:
365
+ logger.info("Step 6/6: Skipping quantization as requested")
366
+
367
+ # Calculate elapsed time
368
+ end_time = time.time()
369
+ duration = end_time - start_time
370
+ logger.info(f"✓ Conversion completed in {duration:.2f} seconds")
371
+ logger.info(f"✓ Final model size: {os.path.getsize(onnx_path) / (1024 * 1024):.2f} MB")
372
+
373
+ # Create a simple example usage file
374
+ example_path = os.path.join(model_dir, "example_usage.py")
375
+ with open(example_path, 'w') as f:
376
+ f.write("""
377
+ import onnxruntime as ort
378
+ from transformers import AutoTokenizer
379
+ import numpy as np
380
+
381
+ # Load tokenizer and model
382
+ tokenizer = AutoTokenizer.from_pretrained("./") # Path to model directory
383
+ session = ort.InferenceSession("./model.onnx")
384
+
385
+ # Prepare input
386
+ prompt = "Hello, how are you?"
387
+ inputs = tokenizer(prompt, return_tensors="np")
388
+
389
+ # Run inference for a single step
390
+ outputs = session.run(
391
+ ["logits"],
392
+ {
393
+ "input_ids": inputs["input_ids"],
394
+ "attention_mask": inputs["attention_mask"]
395
+ }
396
+ )
397
+
398
+ # Get next token prediction
399
+ logits = outputs[0]
400
+ next_token_id = np.argmax(logits[0, -1, :])
401
+ next_token = tokenizer.decode([next_token_id])
402
+ print(f"Next predicted token: {next_token}")
403
+
404
+ # For full generation, you'd typically run in a loop, adding tokens one by one
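+ # A minimal greedy-loop sketch (commented out; variable names follow the example above):
+ # input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
+ # for _ in range(50):
+ #     step_logits = session.run(["logits"], {"input_ids": input_ids, "attention_mask": attention_mask})[0]
+ #     next_id = int(np.argmax(step_logits[0, -1, :]))
+ #     if next_id == tokenizer.eos_token_id:
+ #         break
+ #     input_ids = np.concatenate([input_ids, [[next_id]]], axis=1)
+ #     attention_mask = np.concatenate([attention_mask, [[1]]], axis=1)
+ # print(tokenizer.decode(input_ids[0], skip_special_tokens=True))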
405
+ """)
406
+ logger.info(f"✓ Example usage saved to {example_path}")
407
+
408
+ return True
409
+ else:
410
+ logger.error(f"× ONNX file not created at {onnx_path}")
411
+ return False
412
+
413
+ except Exception as e:
414
+ logger.error(f"× Error converting model: {str(e)}")
415
+ logger.error(traceback.format_exc())
416
+ return False
417
+
418
+
419
+ if __name__ == "__main__":
420
+ # Parse command line arguments
421
+ parser_available = False
422
+ try:
423
+ import argparse
424
+ parser = argparse.ArgumentParser(description="Convert HuggingFace models to ONNX for generation")
425
+ parser.add_argument("model_id", type=str, help="HuggingFace model ID or path")
426
+ parser.add_argument("--output_dir", "-o", type=str, default="./onnx_models",
427
+ help="Output directory for the converted model")
428
+ parser.add_argument("--seq_length", "-s", type=int, default=32,
429
+ help="Sequence length for model export")
430
+ parser.add_argument("--no_quantize", action="store_true",
431
+ help="Skip INT8 quantization step")
432
+
433
+ args = parser.parse_args()
434
+ parser_available = True
435
+
436
+ model_id = args.model_id
437
+ output_dir = args.output_dir
438
+ seq_length = args.seq_length
439
+ quantize = not args.no_quantize
440
+
441
+ except (ImportError, NameError):
442
+ # Fallback if argparse is not available
443
+ parser_available = False
444
+
445
+ if not parser_available:
446
+ if len(sys.argv) < 2:
447
+ print("Usage: python convert_model.py MODEL_ID [OUTPUT_DIR] [SEQ_LENGTH] [--no-quantize]")
448
+ print("Example: python convert_model.py facebook/opt-125m ./onnx_models 32")
449
+ print("\nRecommended models for small hardware:")
450
+ print(" - facebook/opt-125m")
451
+ print(" - distilgpt2")
452
+ print(" - TinyLlama/TinyLlama-1.1B-Chat-v1.0")
453
+ print(" - EleutherAI/pythia-70m")
454
+ sys.exit(1)
455
+
456
+ model_id = sys.argv[1]
457
+ output_dir = sys.argv[2] if len(sys.argv) > 2 else "./onnx_models"
458
+ seq_length = int(sys.argv[3]) if len(sys.argv) > 3 else 32
459
+ quantize = "--no-quantize" not in sys.argv and "--no_quantize" not in sys.argv
460
+
461
+ # Print header
462
+ logger.info("\nENHANCED ONNX CONVERTER FOR LANGUAGE MODELS")
463
+ logger.info("============================================")
464
+ logger.info(f"Model: {model_id}")
465
+ logger.info(f"Output directory: {output_dir}")
466
+ logger.info(f"Sequence length: {seq_length}")
467
+ logger.info(f"Quantization: {'Enabled' if quantize else 'Disabled'}")
468
+
469
+ # Create output directory
470
+ os.makedirs(output_dir, exist_ok=True)
471
+
472
+ # Convert the model
473
+ success = convert_model(model_id, output_dir, seq_length, quantize)
474
+
475
+ if success:
476
+ logger.info("\n" + "=" * 60)
477
+ logger.info("CONVERSION SUCCESSFUL")
478
+ logger.info("=" * 60)
479
+ logger.info(f"Model: {model_id}")
480
+ logger.info(f"Output directory: {os.path.abspath(output_dir)}")
481
+ logger.info("The model is ready for generation!")
482
+ logger.info("\nTo use the model:")
483
+ logger.info("1. See the example_usage.py file in the model directory")
484
+ logger.info("2. For chatbot applications, implement token-by-token generation")
485
+ else:
486
+ logger.error("\n" + "=" * 60)
487
+ logger.error("CONVERSION FAILED")
488
+ logger.error("=" * 60)
489
+ logger.error("Please try one of the recommended models:")
490
+ logger.error(" - facebook/opt-125m")
491
+ logger.error(" - distilgpt2")
492
+ logger.error(" - EleutherAI/pythia-70m")
old_scripts/convert_to_onnx.py ADDED
@@ -0,0 +1,261 @@
1
+ import os
2
+ import gc
3
+ import sys
4
+ import time
5
+ import logging
6
+ import traceback
7
+ import torch
8
+ import warnings
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+ from onnxruntime.quantization import quantize_dynamic, QuantType
11
+
12
+ # Configure logging
13
+ logging.basicConfig(
14
+ level=logging.INFO,
15
+ format='%(asctime)s - %(levelname)s - %(message)s',
16
+ datefmt='%Y-%m-%d %H:%M:%S',
17
+ handlers=[logging.StreamHandler(sys.stdout)]
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Suppress specific warnings
22
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*The shape of the input dimension.*")
23
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*Converting a tensor to a Python.*")
24
+
25
+ # Models that are known to work well with ONNX conversion
26
+ RELIABLE_MODELS = [
27
+ {
28
+ "id": "facebook/opt-350m",
29
+ "description": "Well-balanced model (350M) for RAG and chatbots"
30
+ },
31
+ {
32
+ "id": "gpt2",
33
+ "description": "Very reliable model (124M) with excellent ONNX compatibility"
34
+ },
35
+ {
36
+ "id": "distilgpt2",
37
+ "description": "Lightweight (82M) model with good performance"
38
+ }
39
+ ]
40
+
41
+ class ModelWrapper(torch.nn.Module):
42
+ """
43
+ Wrapper to handle ONNX export compatibility issues.
44
+ This wrapper specifically:
45
+ 1. Bypasses cache handling
46
+ 2. Simplifies the forward pass to avoid dynamic operations
47
+ """
48
+ def __init__(self, model):
49
+ super().__init__()
50
+ self.model = model
51
+
52
+ def forward(self, input_ids):
53
+ # Force no cache, no gradient, and no special features
54
+ with torch.no_grad():
55
+ return self.model(input_ids=input_ids, use_cache=False, return_dict=False)[0]
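+ # With return_dict=False the model returns a plain tuple; index 0 is the logits tensor.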
56
+
57
+ def convert_model(model_id, output_dir, quantize=True):
58
+ """Convert a model to ONNX format with maximum compatibility."""
59
+ start_time = time.time()
60
+
61
+ logger.info(f"\n{'=' * 60}")
62
+ logger.info(f"Converting {model_id} to ONNX")
63
+ logger.info(f"{'=' * 60}")
64
+
65
+ # Create output directory
66
+ model_name = model_id.split("/")[-1]
67
+ model_dir = os.path.join(output_dir, model_name)
68
+ os.makedirs(model_dir, exist_ok=True)
69
+
70
+ try:
71
+ # Step 1: Load tokenizer
72
+ logger.info("Step 1/5: Loading tokenizer...")
73
+
74
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
75
+
76
+ # Handle missing pad token
77
+ if tokenizer.pad_token is None and hasattr(tokenizer, 'eos_token'):
78
+ logger.info("Adding pad_token = eos_token")
79
+ tokenizer.pad_token = tokenizer.eos_token
80
+
81
+ # Save tokenizer
82
+ tokenizer.save_pretrained(model_dir)
83
+ logger.info(f"✓ Tokenizer saved to {model_dir}")
84
+
85
+ # Step 2: Load model with memory optimizations
86
+ logger.info("Step 2/5: Loading model with memory optimizations...")
87
+
88
+ # Clean memory before loading
89
+ gc.collect()
90
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
91
+
92
+ # Load model with optimizations
93
+ model = AutoModelForCausalLM.from_pretrained(
94
+ model_id,
95
+ torch_dtype=torch.float16, # Use half precision
96
+ low_cpu_mem_usage=True # Reduce memory usage
97
+ )
98
+
99
+ # Save config for reference
100
+ model.config.save_pretrained(model_dir)
101
+ logger.info(f"✓ Model config saved to {model_dir}")
102
+
103
+ # Step 3: Prepare for export
104
+ logger.info("Step 3/5: Preparing for export...")
105
+
106
+ # Wrap model to avoid tracing issues
107
+ wrapped_model = ModelWrapper(model)
108
+ wrapped_model.eval() # Set to evaluation mode
109
+
110
+ # Clean memory again
111
+ gc.collect()
112
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
113
+
114
+ # Step 4: Export to ONNX
115
+ logger.info("Step 4/5: Exporting to ONNX format...")
116
+ onnx_path = os.path.join(model_dir, "model.onnx")
117
+
118
+ # Create dummy input
119
+ batch_size = 1
120
+ seq_length = 8 # Small sequence length to reduce memory
121
+ dummy_input = torch.ones(batch_size, seq_length, dtype=torch.long)
122
+
123
+ # Export to ONNX format with new opset version
124
+ torch.onnx.export(
125
+ wrapped_model, # Use wrapped model
126
+ dummy_input, # Model input
127
+ onnx_path, # Output path
128
+ export_params=True, # Store model weights
129
+ opset_version=14, # ONNX opset version (changed from 13 to 14)
130
+ do_constant_folding=True, # Optimize constants
131
+ input_names=['input_ids'], # Input names
132
+ output_names=['logits'], # Output names
133
+ dynamic_axes={
134
+ 'input_ids': {0: 'batch_size', 1: 'sequence'},
135
+ 'logits': {0: 'batch_size', 1: 'sequence'}
136
+ }
137
+ )
138
+
139
+ # Clean up to save memory
140
+ del model
141
+ del wrapped_model
142
+ gc.collect()
143
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
144
+
145
+ # Verify export was successful
146
+ if os.path.exists(onnx_path):
147
+ size_mb = os.path.getsize(onnx_path) / (1024 * 1024)
148
+ logger.info(f"✓ ONNX model saved to {onnx_path}")
149
+ logger.info(f"✓ Original size: {size_mb:.2f} MB")
150
+
151
+ # Step 5: Quantize
152
+ if quantize:
153
+ logger.info("Step 5/5: Applying int8 quantization...")
154
+ quant_path = onnx_path.replace(".onnx", "_quantized.onnx")
155
+
156
+ try:
157
+ quantize_dynamic(
158
+ model_input=onnx_path,
159
+ model_output=quant_path,
160
+ per_channel=False,
161
+ reduce_range=False,
162
+ weight_type=QuantType.QInt8
163
+ )
164
+
165
+ if os.path.exists(quant_path):
166
+ quant_size = os.path.getsize(quant_path) / (1024 * 1024)
167
+ logger.info(f"✓ Quantized size: {quant_size:.2f} MB")
168
+ logger.info(f"✓ Size reduction: {(1 - quant_size/size_mb) * 100:.1f}%")
169
+
170
+ # Replace original with quantized to save space
171
+ os.replace(quant_path, onnx_path)
172
+ logger.info("✓ Replaced original with quantized version")
173
+ else:
174
+ logger.warning("⚠ Quantized file not created, using original")
175
+ except Exception as e:
176
+ logger.error(f"⚠ Quantization error: {str(e)}")
177
+ logger.info("⚠ Using original model without quantization")
178
+ else:
179
+ logger.info("Step 5/5: Skipping quantization (not requested)")
180
+
181
+ # Calculate elapsed time
182
+ end_time = time.time()
183
+ duration = end_time - start_time
184
+ logger.info(f"✓ Conversion completed in {duration:.2f} seconds")
185
+
186
+ return {
187
+ "success": True,
188
+ "model_id": model_id,
189
+ "size_mb": os.path.getsize(onnx_path) / (1024 * 1024),
190
+ "duration_seconds": duration,
191
+ "output_dir": model_dir
192
+ }
193
+ else:
194
+ logger.error(f"× ONNX file not created at {onnx_path}")
195
+ return {
196
+ "success": False,
197
+ "model_id": model_id,
198
+ "error": "ONNX file not created"
199
+ }
200
+
201
+ except Exception as e:
202
+ logger.error(f"× Error converting model: {str(e)}")
203
+ logger.error(traceback.format_exc())
204
+
205
+ return {
206
+ "success": False,
207
+ "model_id": model_id,
208
+ "error": str(e)
209
+ }
210
+
211
+ def main():
212
+ """Convert all reliable models."""
213
+ # Print header
214
+ logger.info("\nGUARANTEED ONNX CONVERTER")
215
+ logger.info("======================")
216
+ logger.info("Using reliable models with proven ONNX compatibility")
217
+
218
+ # Create output directory
219
+ output_dir = "./onnx_models"
220
+ os.makedirs(output_dir, exist_ok=True)
221
+
222
+ # Check if specific model ID provided as argument
223
+ if len(sys.argv) > 1:
224
+ model_id = sys.argv[1]
225
+ logger.info(f"Converting single model: {model_id}")
226
+ convert_model(model_id, output_dir)
227
+ return
228
+
229
+ # Convert all reliable models
230
+ results = []
231
+ for model_info in RELIABLE_MODELS:
232
+ model_id = model_info["id"]
233
+ logger.info(f"Processing model: {model_id}")
234
+ logger.info(f"Description: {model_info['description']}")
235
+
236
+ result = convert_model(model_id, output_dir)
237
+ results.append(result)
238
+
239
+ # Print summary
240
+ logger.info("\n" + "=" * 60)
241
+ logger.info("CONVERSION SUMMARY")
242
+ logger.info("=" * 60)
243
+
244
+ success_count = 0
245
+ for result in results:
246
+ if result.get("success", False):
247
+ success_count += 1
248
+ size_info = f" - Size: {result.get('size_mb', 0):.2f} MB"
249
+ time_info = f" - Time: {result.get('duration_seconds', 0):.2f}s"
250
+ logger.info(f"✓ SUCCESS: {result['model_id']}{size_info}{time_info}")
251
+ else:
252
+ logger.info(f"× FAILED: {result['model_id']} - Error: {result.get('error', 'Unknown error')}")
253
+
254
+ logger.info(f"\nSuccessfully converted {success_count}/{len(RELIABLE_MODELS)} models")
255
+ logger.info(f"Models saved to: {os.path.abspath(output_dir)}")
256
+
257
+ if success_count > 0:
258
+ logger.info("\nThe models are ready for RAG and chatbot applications!")
259
+
260
+ if __name__ == "__main__":
261
+ main()
old_scripts/test_chat.py ADDED
@@ -0,0 +1,402 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import argparse
5
+ import logging
6
+ import numpy as np
7
+ import onnxruntime as ort
8
+ from transformers import AutoTokenizer
9
+ from tqdm import tqdm
10
+
11
+ # Configure logging
12
+ logging.basicConfig(
13
+ level=logging.INFO,
14
+ format='%(asctime)s - %(levelname)s - %(message)s',
15
+ datefmt='%Y-%m-%d %H:%M:%S'
16
+ )
17
+ logger = logging.getLogger(__name__)
18
+
19
+ class ONNXGenerationChatbot:
20
+ def __init__(self, model_path, max_length=100):
21
+ """
22
+ Initialize the ONNX chatbot for text generation.
23
+
24
+ Args:
25
+ model_path: Path to the directory containing the ONNX model and tokenizer
26
+ max_length: Maximum sequence length for generation
27
+ """
28
+ # Set up model paths
29
+ self.model_dir = model_path
30
+ self.onnx_path = os.path.join(self.model_dir, "model.onnx")
31
+ self.fp32_path = os.path.join(self.model_dir, "model_fp32.onnx")
32
+
33
+ # Check for model files
34
+ if not os.path.exists(self.onnx_path):
35
+ raise FileNotFoundError(f"ONNX model not found at {self.onnx_path}")
36
+
37
+ # Get model name for prompt formatting
38
+ self.model_name = os.path.basename(os.path.normpath(model_path))
39
+ logger.info(f"Using model: {self.model_name}")
40
+
41
+ # Load tokenizer
42
+ logger.info(f"Loading tokenizer from {self.model_dir}...")
43
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir, local_files_only=True)
44
+
45
+ # Ensure tokenizer has necessary tokens
46
+ if self.tokenizer.pad_token is None and hasattr(self.tokenizer, 'eos_token'):
47
+ self.tokenizer.pad_token = self.tokenizer.eos_token
48
+
49
+ # Create optimized session
50
+ logger.info(f"Loading ONNX model from {self.onnx_path}...")
51
+ self.session_options = ort.SessionOptions()
52
+ self.session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
53
+ self.session_options.intra_op_num_threads = 4 # Adjust based on your CPU
54
+
55
+ # Create session with appropriate providers
56
+ providers = ['CPUExecutionProvider']
57
+ if 'CUDAExecutionProvider' in ort.get_available_providers():
58
+ logger.info("CUDA is available! Using GPU acceleration.")
59
+ providers.insert(0, 'CUDAExecutionProvider')
60
+
61
+ self.session = ort.InferenceSession(
62
+ self.onnx_path,
63
+ sess_options=self.session_options,
64
+ providers=providers
65
+ )
66
+
67
+ # Get input and output names from the model
68
+ self.input_names = [input.name for input in self.session.get_inputs()]
69
+ self.output_names = [output.name for output in self.session.get_outputs()]
70
+
71
+ logger.info(f"Model inputs: {self.input_names}")
72
+ logger.info(f"Model outputs: {self.output_names}")
73
+
74
+ # Settings
75
+ self.max_length = max_length
76
+ self.stop_tokens = [self.tokenizer.eos_token_id] if self.tokenizer.eos_token_id is not None else []
77
+
78
+ # Try to add common stop tokens if they exist in the vocabulary
79
+ stop_words = ["<|endoftext|>", "</s>", "<|end|>"]
80
+ for word in stop_words:
81
+ try:
82
+ token_id = self.tokenizer.convert_tokens_to_ids(word)
83
+ if token_id not in self.stop_tokens and token_id != self.tokenizer.unk_token_id:
84
+ self.stop_tokens.append(token_id)
85
+ except Exception:
86
+ pass
87
+
88
+ logger.info(f"Using stop tokens: {self.stop_tokens}")
89
+
90
+ # Conversation history for context
91
+ self.conversation_history = []
92
+
93
+ def get_prompt_template(self):
94
+ """
95
+ Get the appropriate prompt template based on the model type.
96
+ """
97
+ if "opt" in self.model_name.lower():
98
+ return "Human: {}\nAssistant:"
99
+ elif "pythia" in self.model_name.lower():
100
+ return "USER: {}\nASSISTANT:"
101
+ elif "llama" in self.model_name.lower() or "alpaca" in self.model_name.lower():
102
+ return "### Human: {}\n### Assistant:"
103
+ elif "gpt2" in self.model_name.lower() or "distilgpt2" in self.model_name.lower():
104
+ return "User: {}\nBot:"
105
+ else:
106
+ return "Question: {}\nAnswer:"
107
+
108
+ def format_prompt_with_history(self, user_message):
109
+ """
110
+ Format the prompt with conversation history for better context.
111
+ """
112
+ template = self.get_prompt_template()
113
+ parts = template.split("{}")
114
+ prefix = parts[0]
115
+ suffix = parts[1] if len(parts) > 1 else ""
116
+
117
+ # Include history if available (up to 3 turns)
118
+ formatted_prompt = ""
119
+ for i, (user, bot) in enumerate(self.conversation_history[-3:]):
120
+ formatted_prompt += f"{prefix}{user}{suffix} {bot}\n\n"
121
+
122
+ # Add current user message
123
+ formatted_prompt += f"{prefix}{user_message}{suffix}"
124
+
125
+ return formatted_prompt
126
+
127
+ def run_inference_step(self, input_ids, attention_mask=None):
128
+ """
129
+ Run a single inference step with the ONNX model.
130
+
131
+ Args:
132
+ input_ids: Token IDs of the input sequence
133
+ attention_mask: Attention mask for the input sequence
134
+
135
+ Returns:
136
+ numpy array: Logits for the next token prediction
137
+ """
138
+ # Prepare model inputs
139
+ model_inputs = {}
140
+ for name in self.input_names:
141
+ if name == "input_ids":
142
+ model_inputs[name] = input_ids
143
+ elif name == "attention_mask" and attention_mask is not None:
144
+ model_inputs[name] = attention_mask
145
+
146
+ # Run inference
147
+ outputs = self.session.run(self.output_names, model_inputs)
148
+
149
+ # Return logits (assumes first output is logits)
150
+ return outputs[0]
151
+
152
+ def generate_text(self, prompt, max_new_tokens=50, temperature=0.7, top_k=50, top_p=0.9,
153
+ repetition_penalty=1.1, do_sample=True, show_progress=True):
154
+ """
155
+ Generate text using the ONNX model.
156
+
157
+ Args:
158
+ prompt: Text prompt to generate from
159
+ max_new_tokens: Maximum number of tokens to generate
160
+ temperature: Temperature for sampling (higher = more random)
161
+ top_k: Number of highest probability tokens to keep for sampling
162
+ top_p: Cumulative probability threshold for nucleus sampling
163
+ repetition_penalty: Penalty for repeating tokens
164
+ do_sample: Whether to sample from the distribution or use greedy decoding
165
+ show_progress: Whether to show a progress bar during generation
166
+
167
+ Returns:
168
+ str: Generated text
169
+ """
170
+ # Encode the prompt
171
+ encoded = self.tokenizer(prompt, return_tensors="np")
172
+ input_ids = encoded["input_ids"]
173
+ attention_mask = encoded["attention_mask"]
174
+
175
+ # Track input tokens for repetition penalty
176
+ prev_tokens = input_ids[0].tolist()
177
+
178
+ # Setup progress bar if requested
179
+ progress = tqdm(total=max_new_tokens, desc="Generating") if show_progress else None
180
+
181
+ # Generate tokens auto-regressively
182
+ for _ in range(max_new_tokens):
183
+ # Run inference to get next token logits
184
+ logits = self.run_inference_step(input_ids, attention_mask)
185
+
186
+ # Get logits for the last token
187
+ next_token_logits = logits[0, -1, :]
188
+
189
+ # Apply temperature scaling
190
+ if temperature > 0:
191
+ next_token_logits = next_token_logits / max(temperature, 1e-8)
192
+
193
+ # Apply repetition penalty
194
+ if repetition_penalty > 1.0:
195
+ for prev_token in set(prev_tokens[-10:]): # Only consider recent tokens
196
+ if prev_token < len(next_token_logits):
197
+ next_token_logits[prev_token] /= repetition_penalty
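+ # Note: this divides every repeated token's logit uniformly; the CTRL-style penalty
+ # divides positive logits and multiplies negative ones, since dividing a negative
+ # logit by a value > 1 actually makes that token more likely.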
198
+
199
+ # Apply top-k filtering
200
+ if top_k > 0:
201
+ indices_to_remove = np.argsort(next_token_logits)[:-top_k]
202
+ next_token_logits[indices_to_remove] = -float('inf')
203
+
204
+ # Apply top-p (nucleus) filtering
205
+ if 0 < top_p < 1.0:
206
+ sorted_logits = np.sort(next_token_logits)[::-1]
207
+ sorted_indices = np.argsort(next_token_logits)[::-1]
208
+ cumulative_probs = np.cumsum(np.exp(sorted_logits) / np.sum(np.exp(sorted_logits)))
209
+
210
+ # Remove tokens with cumulative probability above the threshold
211
+ sorted_indices_to_remove = sorted_indices[cumulative_probs > top_p]
212
+ next_token_logits[sorted_indices_to_remove] = -float('inf')
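+ # Note: the usual nucleus-sampling implementation shifts this mask one position to the
+ # right so the single most likely token is always kept; without the shift, a very peaked
+ # distribution can mask every candidate and fall through to the NaN check below.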
213
+
214
+ # Sample from the filtered distribution or use greedy decoding
215
+ if do_sample:
216
+ # Apply softmax to get probabilities
217
+ probs = np.exp(next_token_logits - np.max(next_token_logits))
218
+ probs = probs / np.sum(probs)
219
+
220
+ # Handle NaNs
221
+ if np.isnan(probs).any():
222
+ next_token_id = np.argmax(next_token_logits)
223
+ else:
224
+ try:
225
+ # Sample from the distribution
226
+ next_token_id = np.random.choice(len(probs), p=probs)
227
+ except:
228
+ # Fallback to greedy if sampling fails
229
+ next_token_id = np.argmax(next_token_logits)
230
+ else:
231
+ # Greedy decoding - take highest probability token
232
+ next_token_id = np.argmax(next_token_logits)
233
+
234
+ # Add the chosen token to the input
235
+ next_token = np.array([[next_token_id]])
236
+ input_ids = np.concatenate([input_ids, next_token], axis=1)
237
+
238
+ # Update attention mask
239
+ attention_mask = np.ones((1, input_ids.shape[1]), dtype=np.int64)
240
+
241
+ # Add token to history for repetition penalty
242
+ prev_tokens.append(int(next_token_id))
243
+
244
+ # Update progress bar if active
245
+ if progress is not None:
246
+ progress.update(1)
247
+
248
+ # Check for stop tokens or end of text
249
+ if next_token_id in self.stop_tokens:
250
+ break
251
+
252
+ # Also stop if we exceed max length
253
+ if input_ids.shape[1] >= self.max_length:
254
+ break
255
+
256
+ # Close progress bar if used
257
+ if progress is not None:
258
+ progress.close()
259
+
260
+ # Decode the full sequence
261
+ generated_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
262
+ return generated_text
263
+
264
+ def extract_assistant_response(self, full_text, prompt):
265
+ """
266
+ Extract just the assistant's response from the full generated text.
267
+
268
+ Args:
269
+ full_text: Full generated text including prompt
270
+ prompt: The original prompt
271
+
272
+ Returns:
273
+ str: Just the assistant's response
274
+ """
275
+ # Try to extract based on the prompt format
276
+ template = self.get_prompt_template()
277
+ response_start_marker = template.split("{}")[-1]
278
+
279
+ # If the prompt is in the text, extract everything after it
280
+ if prompt in full_text:
281
+ after_prompt = full_text[len(prompt):]
282
+
283
+ # Handle additional newlines or spaces at the beginning
284
+ return after_prompt.lstrip()
285
+
286
+ # If the response marker is in the text, extract everything after it
287
+ if response_start_marker.strip() in full_text:
288
+ parts = full_text.split(response_start_marker.strip(), 1)
289
+ if len(parts) > 1:
290
+ return parts[1].strip()
291
+
292
+ # Fallback: return everything after the last line of the prompt
293
+ prompt_last_line = prompt.strip().split('\n')[-1]
294
+ if prompt_last_line in full_text:
295
+ parts = full_text.split(prompt_last_line, 1)
296
+ if len(parts) > 1:
297
+ return parts[1].strip()
298
+
299
+ # Last resort: return the whole thing
300
+ return full_text
301
+
302
+ def chat(self, temperature=0.7, max_new_tokens=100):
303
+ """
304
+ Run an interactive chat session with the model.
305
+
306
+ Args:
307
+ temperature: Temperature for text generation
308
+ max_new_tokens: Maximum number of tokens to generate per response
309
+ """
310
+ print("\n===== ONNX Generation Chatbot =====")
311
+ print(f"Model: {self.model_name}")
312
+ print(f"Type 'exit' to end the conversation")
313
+ print(f"Type 'reset' to clear conversation history")
314
+
315
+ while True:
316
+ # Get user input
317
+ user_input = input("\nYou: ")
318
+
319
+ # Check for exit command
320
+ if user_input.lower() in ["exit", "quit", "bye"]:
321
+ print("Goodbye!")
322
+ break
323
+
324
+ # Check for reset command
325
+ if user_input.lower() == "reset":
326
+ self.conversation_history = []
327
+ print("Conversation history cleared.")
328
+ continue
329
+
330
+ # Create prompt with history
331
+ prompt = self.format_prompt_with_history(user_input)
332
+ print("\nGenerating response...")
333
+
334
+ # Generate text
335
+ try:
336
+ start_time = time.time()
337
+ full_text = self.generate_text(
338
+ prompt,
339
+ max_new_tokens=max_new_tokens,
340
+ temperature=temperature,
341
+ show_progress=True
342
+ )
343
+
344
+ # Extract just the assistant's response
345
+ response = self.extract_assistant_response(full_text, prompt)
346
+
347
+ # Clean up any trailing incomplete sentences
348
+ if response and len(response) > 0:
349
+ # Try to end at a sentence boundary if possible
350
+ sentence_end = max(
351
+ response.rfind('.'),
352
+ response.rfind('!'),
353
+ response.rfind('?')
354
+ )
355
+ if sentence_end > len(response) * 0.5: # Only trim if we're not losing too much
356
+ response = response[:sentence_end+1]
357
+
358
+ # Calculate generation time
359
+ gen_time = time.time() - start_time
360
+ gen_speed = max_new_tokens / gen_time if gen_time > 0 else 0
361
+
362
+ # Print the response
363
+ print(f"\nBot: {response}")
364
+ print(f"\n[Generated {len(response)} chars in {gen_time:.2f}s ({gen_speed:.1f} tokens/sec)]")
365
+
366
+ # Add to conversation history
367
+ self.conversation_history.append((user_input, response))
368
+
369
+ # Keep history at a reasonable size
370
+ if len(self.conversation_history) > 10:
371
+ self.conversation_history = self.conversation_history[-10:]
372
+
373
+ except Exception as e:
374
+ logger.error(f"Error generating response: {str(e)}")
375
+ print("\nBot: I encountered an error while generating a response. Let's try again.")
376
+
377
+
378
+ def main():
379
+ """Run the ONNX chatbot with command line arguments."""
380
+ parser = argparse.ArgumentParser(description="Interactive ONNX Chatbot")
381
+ parser.add_argument("--model", type=str, required=True,
382
+ help="Path to the ONNX model directory")
383
+ parser.add_argument("--temperature", type=float, default=0.7,
384
+ help="Temperature for text generation (default: 0.7)")
385
+ parser.add_argument("--max_tokens", type=int, default=100,
386
+ help="Maximum tokens to generate per response (default: 100)")
387
+
388
+ args = parser.parse_args()
389
+
390
+ try:
391
+ # Create and run the chatbot
392
+ chatbot = ONNXGenerationChatbot(args.model)
393
+ chatbot.chat(temperature=args.temperature, max_new_tokens=args.max_tokens)
394
+ except KeyboardInterrupt:
395
+ print("\nExiting chatbot. Goodbye!")
396
+ except Exception as e:
397
+ logger.error(f"Error: {str(e)}")
398
+ sys.exit(1)
399
+
400
+
401
+ if __name__ == "__main__":
402
+ main()
onnx_models/bloom_onnx/config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "bigscience/bloom-560m",
4
+ "apply_residual_connection_post_layernorm": false,
5
+ "architectures": [
6
+ "BloomForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attention_softmax_in_fp32": true,
10
+ "bias_dropout_fusion": true,
11
+ "bos_token_id": 1,
12
+ "eos_token_id": 2,
13
+ "hidden_dropout": 0.0,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "layer_norm_epsilon": 1e-05,
17
+ "masked_softmax_fusion": true,
18
+ "model_type": "bloom",
19
+ "n_head": 16,
20
+ "n_inner": null,
21
+ "n_layer": 24,
22
+ "offset_alibi": 100,
23
+ "pad_token_id": 3,
24
+ "pretraining_tp": 1,
25
+ "skip_bias_add": true,
26
+ "skip_bias_add_qkv": false,
27
+ "slow_but_exact": false,
28
+ "transformers_version": "4.48.3",
29
+ "unk_token_id": 0,
30
+ "use_cache": true,
31
+ "vocab_size": 250880
32
+ }
onnx_models/bloom_onnx/generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 3,
6
+ "transformers_version": "4.48.3"
7
+ }
onnx_models/bloom_onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:268cdaf473da19cc5cb7f1c0eef597e3719dc88524ebc4e78b51268cfcdb8d28
3
+ size 798372
onnx_models/bloom_onnx/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
onnx_models/bloom_onnx/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d963066d6adae5034a1dc114c3ac444512de09928cf14ed4562ba94d9a440e66
3
+ size 21763085
onnx_models/bloom_onnx/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<unk>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<pad>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ }
36
+ },
37
+ "bos_token": "<s>",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "extra_special_tokens": {},
41
+ "merges_file": null,
42
+ "model_max_length": 1000000000000000019884624838656,
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "tokenizer_class": "BloomTokenizer",
46
+ "unk_token": "<unk>",
47
+ "vocab_file": null
48
+ }
onnx_models/bloom_onnx_quantized/config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "onnx_models/bloom_onnx",
4
+ "apply_residual_connection_post_layernorm": false,
5
+ "architectures": [
6
+ "BloomForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attention_softmax_in_fp32": true,
10
+ "bias_dropout_fusion": true,
11
+ "bos_token_id": 1,
12
+ "eos_token_id": 2,
13
+ "hidden_dropout": 0.0,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "layer_norm_epsilon": 1e-05,
17
+ "masked_softmax_fusion": true,
18
+ "model_type": "bloom",
19
+ "n_head": 16,
20
+ "n_inner": null,
21
+ "n_layer": 24,
22
+ "offset_alibi": 100,
23
+ "pad_token_id": 3,
24
+ "pretraining_tp": 1,
25
+ "skip_bias_add": true,
26
+ "skip_bias_add_qkv": false,
27
+ "slow_but_exact": false,
28
+ "transformers_version": "4.48.3",
29
+ "unk_token_id": 0,
30
+ "use_cache": true,
31
+ "vocab_size": 250880
32
+ }
onnx_models/bloom_onnx_quantized/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:179e57ab6bb5a39b3feef242d2d569aa321f8f1461ec5247c1bd980444b07419
3
+ size 561463713
onnx_models/bloom_onnx_quantized/ort_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "one_external_file": true,
3
+ "opset": null,
4
+ "optimization": {},
5
+ "quantization": {
6
+ "activations_dtype": "QUInt8",
7
+ "activations_symmetric": false,
8
+ "format": "QOperator",
9
+ "is_static": false,
10
+ "mode": "IntegerOps",
11
+ "nodes_to_exclude": [],
12
+ "nodes_to_quantize": [],
13
+ "operators_to_quantize": [
14
+ "Conv",
15
+ "MatMul",
16
+ "Attention",
17
+ "LSTM",
18
+ "Gather",
19
+ "Transpose",
20
+ "EmbedLayerNormalization"
21
+ ],
22
+ "per_channel": false,
23
+ "qdq_add_pair_to_weight": false,
24
+ "qdq_dedicated_pair": false,
25
+ "qdq_op_type_per_channel_support_to_axis": {
26
+ "MatMul": 1
27
+ },
28
+ "reduce_range": false,
29
+ "weights_dtype": "QInt8",
30
+ "weights_symmetric": true
31
+ },
32
+ "use_external_data_format": false
33
+ }
onnx_models/bloom_onnx_quantized/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
onnx_models/bloom_onnx_quantized/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d963066d6adae5034a1dc114c3ac444512de09928cf14ed4562ba94d9a440e66
3
+ size 21763085
onnx_models/bloom_onnx_quantized/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<unk>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<pad>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ }
36
+ },
37
+ "bos_token": "<s>",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "extra_special_tokens": {},
41
+ "merges_file": null,
42
+ "model_max_length": 1000000000000000019884624838656,
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "tokenizer_class": "BloomTokenizer",
46
+ "unk_token": "<unk>",
47
+ "vocab_file": null
48
+ }
onnx_models/falcon_onnx/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "tiiuae/falcon-rw-1b",
+ "activation": "gelu",
+ "alibi": true,
+ "apply_residual_connection_post_layernorm": false,
+ "architectures": [
+ "FalconForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "tiiuae/falcon-rw-1b--configuration_falcon.FalconConfig",
+ "AutoModel": "tiiuae/falcon-rw-1b--modeling_falcon.FalconModel",
+ "AutoModelForCausalLM": "tiiuae/falcon-rw-1b--modeling_falcon.FalconForCausalLM",
+ "AutoModelForQuestionAnswering": "tiiuae/falcon-rw-1b--modeling_falcon.FalconForQuestionAnswering",
+ "AutoModelForSequenceClassification": "tiiuae/falcon-rw-1b--modeling_falcon.FalconForSequenceClassification",
+ "AutoModelForTokenClassification": "tiiuae/falcon-rw-1b--modeling_falcon.FalconForTokenClassification"
+ },
+ "bias": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "ffn_hidden_size": 8192,
+ "hidden_dropout": 0.0,
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 2048,
+ "model_type": "falcon",
+ "multi_query": false,
+ "new_decoder_architecture": false,
+ "num_attention_heads": 32,
+ "num_hidden_layers": 24,
+ "num_kv_heads": 32,
+ "num_ln_in_parallel_attn": null,
+ "parallel_attn": false,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
onnx_models/falcon_onnx/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "transformers_version": "4.48.3"
+ }
onnx_models/falcon_onnx/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/falcon_onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c26d8b62a099f87043745987be680556c9a7c0324944af78d58b7f8559b73c17
+ size 655121
onnx_models/falcon_onnx/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "bos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
onnx_models/falcon_onnx/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/falcon_onnx/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "extra_special_tokens": {},
+ "model_max_length": 1024,
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
onnx_models/falcon_onnx/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/gpt2_onnx/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "gpt2-medium",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 1024,
+ "n_head": 16,
+ "n_inner": null,
+ "n_layer": 24,
+ "n_positions": 1024,
+ "n_special": 0,
+ "predict_special_tokens": true,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
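Note: the config above records that this folder was exported from the gpt2-medium checkpoint (24 layers, 1024-dim embeddings). The sketch below shows one plausible way the onnx_models/gpt2_onnx folder could have been produced with Hugging Face Optimum; the export call and options are assumptions for illustration, not the authors' verified command.

```python
# Hypothetical export sketch for an ONNX folder like onnx_models/gpt2_onnx.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

# Export the PyTorch checkpoint to ONNX on the fly.
model = ORTModelForCausalLM.from_pretrained("gpt2-medium", export=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

# Writes model.onnx, config.json, generation_config.json ...
model.save_pretrained("onnx_models/gpt2_onnx")
# ... and the tokenizer files (vocab.json, merges.txt, tokenizer.json, etc.) shown below.
tokenizer.save_pretrained("onnx_models/gpt2_onnx")
```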
onnx_models/gpt2_onnx/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "transformers_version": "4.48.3"
+ }
onnx_models/gpt2_onnx/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/gpt2_onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7c266101cd0fb1a383a3006369c88384d5f76081ec1b9a4e76ff4b7bc15ffe6
+ size 1420150742
onnx_models/gpt2_onnx/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "bos_token": "<|endoftext|>",
+ "eos_token": "<|endoftext|>",
+ "unk_token": "<|endoftext|>"
+ }
onnx_models/gpt2_onnx/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/gpt2_onnx/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "extra_special_tokens": {},
+ "model_max_length": 1024,
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
onnx_models/gpt2_onnx/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/gpt2_onnx_quantized/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "onnx_models/gpt2_onnx",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 1024,
+ "n_head": 16,
+ "n_inner": null,
+ "n_layer": 24,
+ "n_positions": 1024,
+ "n_special": 0,
+ "predict_special_tokens": true,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
onnx_models/gpt2_onnx_quantized/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/gpt2_onnx_quantized/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b051bd6632d5039e281be589c65e56abc86861340f41d044f49f496afe35aa07
+ size 357201134
onnx_models/gpt2_onnx_quantized/ort_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "one_external_file": true,
+ "opset": null,
+ "optimization": {},
+ "quantization": {
+ "activations_dtype": "QUInt8",
+ "activations_symmetric": false,
+ "format": "QOperator",
+ "is_static": false,
+ "mode": "IntegerOps",
+ "nodes_to_exclude": [],
+ "nodes_to_quantize": [],
+ "operators_to_quantize": [
+ "Conv",
+ "MatMul",
+ "Attention",
+ "LSTM",
+ "Gather",
+ "Transpose",
+ "EmbedLayerNormalization"
+ ],
+ "per_channel": false,
+ "qdq_add_pair_to_weight": false,
+ "qdq_dedicated_pair": false,
+ "qdq_op_type_per_channel_support_to_axis": {
+ "MatMul": 1
+ },
+ "reduce_range": false,
+ "weights_dtype": "QInt8",
+ "weights_symmetric": true
+ },
+ "use_external_data_format": false
+ }
onnx_models/gpt2_onnx_quantized/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "bos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
onnx_models/gpt2_onnx_quantized/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/gpt2_onnx_quantized/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "extra_special_tokens": {},
+ "model_max_length": 1024,
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
onnx_models/gpt2_onnx_quantized/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/opt_onnx/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "facebook/opt-350m",
+ "_remove_final_layer_norm": false,
+ "activation_dropout": 0.0,
+ "activation_function": "relu",
+ "architectures": [
+ "OPTForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 2,
+ "do_layer_norm_before": false,
+ "dropout": 0.1,
+ "enable_bias": true,
+ "eos_token_id": 2,
+ "ffn_dim": 4096,
+ "hidden_size": 1024,
+ "init_std": 0.02,
+ "layer_norm_elementwise_affine": true,
+ "layerdrop": 0.0,
+ "max_position_embeddings": 2048,
+ "model_type": "opt",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "pad_token_id": 1,
+ "prefix": "</s>",
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 50272,
+ "word_embed_proj_dim": 512
+ }
onnx_models/opt_onnx/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 2,
+ "eos_token_id": 2,
+ "pad_token_id": 1,
+ "transformers_version": "4.48.3"
+ }
onnx_models/opt_onnx/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
onnx_models/opt_onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33dbbbda8ead8a71ec8ad090902faadf3b292e64f4641087efe17996e9b85aa9
+ size 1325122848
onnx_models/opt_onnx/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
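Note: each exported folder in this commit (full-precision model.onnx or quantized model_quantized.onnx, plus config and tokenizer files) can be loaded for CPU inference with ONNX Runtime. The sketch below is one plausible way to do that via Hugging Face Optimum; the folder path comes from this repo, while the generation settings are illustrative assumptions only.

```python
# Hypothetical usage sketch: load an exported folder and generate text on CPU.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_dir = "onnx_models/opt_onnx"
# For a quantized folder, pass file_name="model_quantized.onnx" instead.
model = ORTModelForCausalLM.from_pretrained(model_dir, file_name="model.onnx")
tokenizer = AutoTokenizer.from_pretrained(model_dir)

inputs = tokenizer("Hello, how are you?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=30, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```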